Re: [PATCH v2 8/8] x86/ioapic: Generate RTE directly from parent irqchip's MSI message

From: David Woodhouse
Date: Fri Oct 23 2020 - 13:04:17 EST


On Fri, 2020-10-23 at 00:10 +0200, Thomas Gleixner wrote:
> > -static void mp_setup_entry(struct irq_cfg *cfg, struct
> > mp_chip_data *data,
> > - struct IO_APIC_route_entry *entry)
> > +static void mp_setup_entry(struct irq_data *irq_data, struct
> > mp_chip_data *data)
> > {
> > + struct IO_APIC_route_entry *entry = &data->entry;
> > +
> > memset(entry, 0, sizeof(*entry));
> > - entry->delivery_mode = apic->irq_delivery_mode;
> > - entry->dest_mode = apic->irq_dest_mode;
> > - entry->dest = cfg->dest_apicid & 0xff;
> > - entry->virt_ext_dest = cfg->dest_apicid >> 8;
> > - entry->vector = cfg->vector;
> > +
> > + mp_swizzle_msi_dest_bits(irq_data, entry);
> > +
> > entry->trigger = data->trigger;
> > entry->polarity = data->polarity;
> > /*
>
> does not make sense. It did not make sense before either, but now it
> does even make less sense.
>
> During allocation this only needs to setup the I/O-APIC specific bits
> (trigger, polarity, mask). The rest is filled in when the actual
> activation happens. Nothing writes that entry _before_ activation.
>
> /me goes to mop up more

Yeah... that code was indeed a pile of crap before I looked at it,
wasn't it? And I indeed failed to spot it and mop it up as I touched
it.

Here's the version I've just pushed to my tree, which I'll test
properly both with and without IR over the weekend before posting v3.

There is no way that bit swizzling is ever going to be anything short
of fugly. That's just the reality of the hardware. Even doing it with
bitfields is just going to be masking the issue by having structure
definitions that don't actually match the I/OAPIC documentation. Better
to be up front about it. I've added more words though; more words
always help...

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/ext_dest_id

From f912b52996d381fd8a631dd10c713772c2ade478 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw@xxxxxxxxxxxx>
Date: Thu, 8 Oct 2020 15:44:42 +0100
Subject: [PATCH 10/19] x86/ioapic: Generate RTE directly from parent irqchip's
MSI message

The I/OAPIC generates an MSI cycle with address/data bits taken from its
Redirection Table Entry in some combination which used to make sense,
but now is just a bunch of bits which get passed through in some
seemingly arbitrary order.

Instead of making IRQ remapping drivers directly frob the I/OAPIC RTE,
let them just do their job and generate an MSI message. The bit
swizzling to turn that MSI message into the IOAPIC's RTE is the same in
all cases, since it's a function of the I/OAPIC hardware. The IRQ
remappers have no real need to get involved with that.

The only slight caveat is that the I/OAPIC is interpreting some of
those fields too, and it does want the 'vector' field to be unique
to make EOI work. The AMD IOMMU happens to put its IRTE index in the
bits that the I/OAPIC thinks are the vector field, and accommodates
this requirement by reserving the first 32 indices for the I/OAPIC.
The Intel IOMMU doesn't actually use the bits that the I/OAPIC thinks
are the vector field, so it fills in the 'pin' value there instead.

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
arch/x86/include/asm/hw_irq.h | 11 ++--
arch/x86/include/asm/msidef.h | 2 +
arch/x86/kernel/apic/io_apic.c | 81 ++++++++++++++++++++++-------
drivers/iommu/amd/iommu.c | 14 -----
drivers/iommu/hyperv-iommu.c | 31 -----------
drivers/iommu/intel/irq_remapping.c | 19 ++-----
6 files changed, 74 insertions(+), 84 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index a4aeeaace040..aabd8f1b6bb0 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -45,12 +45,11 @@ enum irq_alloc_type {
};

struct ioapic_alloc_info {
- int pin;
- int node;
- u32 trigger : 1;
- u32 polarity : 1;
- u32 valid : 1;
- struct IO_APIC_route_entry *entry;
+ int pin;
+ int node;
+ u32 trigger : 1;
+ u32 polarity : 1;
+ u32 valid : 1;
};

struct uv_alloc_info {
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
index ee2f8ccc32d0..37c3d2d492c9 100644
--- a/arch/x86/include/asm/msidef.h
+++ b/arch/x86/include/asm/msidef.h
@@ -18,6 +18,7 @@
#define MSI_DATA_DELIVERY_MODE_SHIFT 8
#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
+#define MSI_DATA_DELIVERY_MODE_MASK (7 << MSI_DATA_DELIVERY_MODE_SHIFT) /* 3-bit field, data bits 10:8 */

#define MSI_DATA_LEVEL_SHIFT 14
#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
@@ -37,6 +38,7 @@
#define MSI_ADDR_DEST_MODE_SHIFT 2
#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT)
#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT)
+#define MSI_ADDR_DEST_MODE_MASK (1 << MSI_ADDR_DEST_MODE_SHIFT) /* bit 2 of address_lo */

#define MSI_ADDR_REDIRECTION_SHIFT 3
#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 54f6a029b1d1..b9e6236af833 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -48,6 +48,7 @@
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
#include <linux/memblock.h>
+#include <linux/msi.h>

#include <asm/irqdomain.h>
#include <asm/io.h>
@@ -63,6 +64,7 @@
#include <asm/setup.h>
#include <asm/irq_remapping.h>
#include <asm/hw_irq.h>
+#include <asm/msidef.h>

#include <asm/apic.h>

@@ -1851,22 +1853,64 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data)
eoi_ioapic_pin(data->entry.vector, data);
}

+static void mp_swizzle_msi_dest_bits(struct irq_data *irq_data,
+ struct IO_APIC_route_entry *rte)
+{
+ struct msi_msg msg;
+ u32 *entry = (u32 *)rte;
+
+ /*
+ * They're in a bit of a random order for historical reasons, but
+ * the I/OAPIC is just a device for turning interrupt lines into
+ * MSIs, and various bits of the MSI addr/data are just swizzled
+ * into/from the bits of Redirection Table Entry. So let the
+ * upstream irqdomain (be it interrupt remapping or otherwise)
+ * compose the MSI message, and we'll shift the bits into the
+ * appropriate place in the RTE.
+ */
+ irq_chip_compose_msi_msg(irq_data, &msg);
+
+ /*
+ * The low 12 bits of the RTE were historically the vector,
+ * delivery_mode and destination mode. Which come from the
+ * low 8 bits of the MSI data, the *next* 3 bits of the MSI
+ * data, and bit 2 of the MSI address (which thus has to be
+ * shifted up by 9 to land in the right place in bit 11 of
+ * the RTE).
+ *
+ * With Interrupt Remapping of course many bits in the MSI
+ * have different meanings but the bit-swizzling of the
+ * I/OAPIC hardware remains the same.
+ */
+ entry[0] &= 0xfffff000;
+ entry[0] |= (msg.data & (MSI_DATA_DELIVERY_MODE_MASK |
+ MSI_DATA_VECTOR_MASK));
+ entry[0] |= (msg.address_lo & MSI_ADDR_DEST_MODE_MASK) << 9;
+
+ /*
+ * Top 16 bits of the RTE are the destination and extended
+ * destination ID fields, which come from bits 19-4 of the
+ * MSI address.
+ */
+ entry[1] &= 0xffff;
+ entry[1] |= (msg.address_lo & MSI_ADDR_DEST_ID_MASK) << 12;
+}
+
+
static void ioapic_configure_entry(struct irq_data *irqd)
{
struct mp_chip_data *mpd = irqd->chip_data;
- struct irq_cfg *cfg = irqd_cfg(irqd);
struct irq_pin_list *entry;

/*
- * Only update when the parent is the vector domain, don't touch it
- * if the parent is the remapping domain. Check the installed
- * ioapic chip to verify that.
+ * The polarity, trigger and mask bits have already been
+ * set up at allocation time by mp_setup_entry(). What
+ * remains on activation and set_affinity is to set up
+ * the various destination bits which are obtained from
+ * the upstream irq domain's generated MSI message.
*/
- if (irqd->chip == &ioapic_chip) {
- mpd->entry.dest = cfg->dest_apicid & 0xff;
- mpd->entry.virt_ext_dest = cfg->dest_apicid >> 8;
- mpd->entry.vector = cfg->vector;
- }
+ mp_swizzle_msi_dest_bits(irqd, &mpd->entry);
+
for_each_irq_pin(entry, mpd->irq_2_pin)
__ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
}
@@ -2949,15 +2993,16 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
}
}

-static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
- struct IO_APIC_route_entry *entry)
+static void mp_setup_entry(struct irq_data *irq_data, struct mp_chip_data *data)
{
+ struct IO_APIC_route_entry *entry = &data->entry;
+
+ /*
+ * The destination bits get set up by ioapic_configure_entry()
+ * when the IRQ is activated. For now just set up the I/OAPIC
+ * specific fields.
+ */
memset(entry, 0, sizeof(*entry));
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->dest = cfg->dest_apicid & 0xff;
- entry->virt_ext_dest = cfg->dest_apicid >> 8;
- entry->vector = cfg->vector;
entry->trigger = data->trigger;
entry->polarity = data->polarity;
/*
@@ -2995,7 +3040,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
if (!data)
return -ENOMEM;

- info->ioapic.entry = &data->entry;
ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
if (ret < 0) {
kfree(data);
@@ -3013,8 +3057,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);

local_irq_save(flags);
- if (info->ioapic.entry)
- mp_setup_entry(cfg, data, info->ioapic.entry);
+ mp_setup_entry(irq_data, data);
mp_register_handler(virq, data->trigger);
if (virq < nr_legacy_irqs())
legacy_pic->mask(virq);
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index ef64e01f66d7..13d0a8f42d56 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3597,7 +3597,6 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,
{
struct irq_2_irte *irte_info = &data->irq_2_irte;
struct msi_msg *msg = &data->msi_entry;
- struct IO_APIC_route_entry *entry;
struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];

if (!iommu)
@@ -3611,19 +3610,6 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data,

switch (info->type) {
case X86_IRQ_ALLOC_TYPE_IOAPIC:
- /* Setup IOAPIC entry */
- entry = info->ioapic.entry;
- info->ioapic.entry = NULL;
- memset(entry, 0, sizeof(*entry));
- entry->vector = index;
- entry->mask = 0;
- entry->trigger = info->ioapic.trigger;
- entry->polarity = info->ioapic.polarity;
- /* Mask level triggered irqs. */
- if (info->ioapic.trigger)
- entry->mask = 1;
- break;
-
case X86_IRQ_ALLOC_TYPE_HPET:
case X86_IRQ_ALLOC_TYPE_PCI_MSI:
case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c
index 12ec31534995..3a674262cc91 100644
--- a/drivers/iommu/hyperv-iommu.c
+++ b/drivers/iommu/hyperv-iommu.c
@@ -40,7 +40,6 @@ static int hyperv_ir_set_affinity(struct irq_data *data,
{
struct irq_data *parent = data->parent_data;
struct irq_cfg *cfg = irqd_cfg(data);
- struct IO_APIC_route_entry *entry;
int ret;

/* Return error If new irq affinity is out of ioapic_max_cpumask. */
@@ -51,9 +50,6 @@ static int hyperv_ir_set_affinity(struct irq_data *data,
if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
return ret;

- entry = data->chip_data;
- entry->dest = cfg->dest_apicid;
- entry->vector = cfg->vector;
send_cleanup_vector(cfg);

return 0;
@@ -89,20 +85,6 @@ static int hyperv_irq_remapping_alloc(struct irq_domain *domain,

irq_data->chip = &hyperv_ir_chip;

- /*
- * If there is interrupt remapping function of IOMMU, setting irq
- * affinity only needs to change IRTE of IOMMU. But Hyper-V doesn't
- * support interrupt remapping function, setting irq affinity of IO-APIC
- * interrupts still needs to change IO-APIC registers. But ioapic_
- * configure_entry() will ignore value of cfg->vector and cfg->
- * dest_apicid when IO-APIC's parent irq domain is not the vector
- * domain.(See ioapic_configure_entry()) In order to setting vector
- * and dest_apicid to IO-APIC register, IO-APIC entry pointer is saved
- * in the chip_data and hyperv_irq_remapping_activate()/hyperv_ir_set_
- * affinity() set vector and dest_apicid directly into IO-APIC entry.
- */
- irq_data->chip_data = info->ioapic.entry;
-
/*
* Hypver-V IO APIC irq affinity should be in the scope of
* ioapic_max_cpumask because no irq remapping support.
@@ -119,22 +101,9 @@ static void hyperv_irq_remapping_free(struct irq_domain *domain,
irq_domain_free_irqs_common(domain, virq, nr_irqs);
}

-static int hyperv_irq_remapping_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool reserve)
-{
- struct irq_cfg *cfg = irqd_cfg(irq_data);
- struct IO_APIC_route_entry *entry = irq_data->chip_data;
-
- entry->dest = cfg->dest_apicid;
- entry->vector = cfg->vector;
-
- return 0;
-}
-
static const struct irq_domain_ops hyperv_ir_domain_ops = {
.alloc = hyperv_irq_remapping_alloc,
.free = hyperv_irq_remapping_free,
- .activate = hyperv_irq_remapping_activate,
};

static int __init hyperv_prepare_irq_remapping(void)
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 0cfce1d3b7bb..511dfb4884bc 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1265,7 +1265,6 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
struct irq_alloc_info *info,
int index, int sub_handle)
{
- struct IR_IO_APIC_route_entry *entry;
struct irte *irte = &data->irte_entry;
struct msi_msg *msg = &data->msi_entry;

@@ -1281,23 +1280,15 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data,
irte->avail, irte->vector, irte->dest_id,
irte->sid, irte->sq, irte->svt);

- entry = (struct IR_IO_APIC_route_entry *)info->ioapic.entry;
- info->ioapic.entry = NULL;
- memset(entry, 0, sizeof(*entry));
- entry->index2 = (index >> 15) & 0x1;
- entry->zero = 0;
- entry->format = 1;
- entry->index = (index & 0x7fff);
/*
* IO-APIC RTE will be configured with virtual vector.
* irq handler will do the explicit EOI to the io-apic.
*/
- entry->vector = info->ioapic.pin;
- entry->mask = 0; /* enable IRQ */
- entry->trigger = info->ioapic.trigger;
- entry->polarity = info->ioapic.polarity;
- if (info->ioapic.trigger)
- entry->mask = 1; /* Mask level triggered irqs. */
+ msg->data = info->ioapic.pin;
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+ MSI_ADDR_IR_INDEX1(index) |
+ MSI_ADDR_IR_INDEX2(index);
break;

case X86_IRQ_ALLOC_TYPE_HPET:
--
2.17.1

Attachment: smime.p7s
Description: S/MIME cryptographic signature