Re: [PATCH] irqchip/gicv3: Workaround for NVIDIA erratum T241-FABRIC-4

From: Shanker Donthineni
Date: Sun Mar 12 2023 - 23:50:10 EST


Thanks Marc for a quick and valuable feedback.

On 3/12/23 08:43, Marc Zyngier wrote:
External email: Use caution opening links or attachments


On Fri, 10 Mar 2023 14:16:34 +0000,
Shanker Donthineni <sdonthineni@xxxxxxxxxx> wrote:

Hi Marc,

On 3/7/23 02:32, Marc Zyngier wrote:
External email: Use caution opening links or attachments


On Mon, 06 Mar 2023 01:31:48 +0000,
Shanker Donthineni <sdonthineni@xxxxxxxxxx> wrote:

The purpose of this patch is to address the T241 erratum T241-FABRIC-4,
which causes unexpected behavior in the GIC when multiple transactions

nit: "The purpose of this patch" is superfluous. Instead, write
something like:

"The T241 platform suffers from the T241-FABRIC-4 erratum which
causes..."

I'll fix in v2 patch.

are received simultaneously from different sources. This hardware issue
impacts NVIDIA server platforms that use more than two T241 chips
interconnected. Each chip has support for 320 {E}SPIs.

This issue occurs when multiple packets from different GICs are
incorrectly interleaved at the target chip. The erratum text below
specifies exactly what can cause multiple transfer packets susceptible
to interleaving and GIC state corruption. GIC state corruption can
lead to a range of problems, including kernel panics, and unexpected
behavior.

From the erratum text:
"In some cases, inter-socket AXI4 Stream packets with multiple
transfers, may be interleaved by the fabric when presented to ARM
Generic Interrupt Controller. GIC expects all transfers of a packet
to be delivered without any interleaving.

The following GICv3 commands may result in multiple transfer packets
over inter-socket AXI4 Stream interface:
- Register reads from GICD_I* and GICD_N*
- Register writes to 64-bit GICD registers other than GICD_IROUTERn*
- ITS command MOVALL

Does is also affect cross-chip traffic such as SPI deactivation?

No, it is not impacted.


Multiple commands in GICv4+ utilize multiple transfer packets,
including VMOVP, VMOVI and VMAPP.

This issue impacts system configurations with more than 2 sockets,
that require multi-transfer packets to be sent over inter-socket
AXI4 Stream interface between GIC instances on different sockets.
GICv4 cannot be supported. GICv3 SW model can only be supported
with the workaround. Single and Dual socket configurations are not
impacted by this issue and support GICv3 and GICv4."

Do you have a public link to this erratum? This is really something
that we should be go back to when changing things in the GIC code
(should we ever use MOVALL, for example).

https://developer.nvidia.com/docs/t241-fabric-4/nvidia-t241-fabric-4-errata.pdf

Great. Please add this to the commit message and a comment next to the
workaround code.

I'll do it.


[...]

+static inline void __iomem *gic_dist_base_read_alias(irq_hw_number_t intid)
+{
+ struct dist_base_alias *base_alias;
+ int i;
+
+ if (static_branch_unlikely(&gic_nvidia_t241_erratum)) {
+ base_alias = gic_data.base_read_aliases;
+ for (i = 0; i < gic_data.nr_dist_base_aliases; i++) {
+ if (base_alias->base &&
+ (intid >= base_alias->intid_start) &&
+ (intid <= base_alias->intid_end)) {
+ return base_alias->base;
+ }
+ base_alias++;
+ }
+ }

Each distributor has the exact same number of SPIs. So why isn't this
just a division that gives you a distributor number?


I considered creating a generic function that could potentially be
utilized in the future for other Workarounds (WARs).

I'll change to this in v2.

static inline void __iomem *gic_dist_base_alias(irq_hw_number_t intid)
{
u32 chip;

if (static_branch_unlikely(&gic_nvidia_t241_erratum)) {
/**
* {E}SPI mappings for all 4 chips
* Chip0 = 32-351
* Chip1 = 52-671

s/52/352/, right?
Thanks, typo, fix in v2.


* Chip2 = 672-991
* Chip3 = 4096-4415
*/
switch (__get_intid_range(intid)) {
case SPI_RANGE:
chip = (intid - 32) / 320;
break;
case ESPI_RANGE:
chip = 3;
break;
default:
unreachable();
}
BUG_ON(!t241_dist_base_alias[chip]);

You can drop this BUG_ON(), and replace it with on at probe time.

return t241_dist_base_alias[chip];
}

return gic_data.dist_base;
}

Yup, that's much better.


+
+ return gic_data.dist_base;
+}
+
static inline void __iomem *gic_dist_base(struct irq_data *d)
{
switch (get_intid_range(d)) {
@@ -346,7 +377,7 @@ static int gic_peek_irq(struct irq_data *d, u32 offset)
if (gic_irq_in_rdist(d))
base = gic_data_rdist_sgi_base();
else
- base = gic_data.dist_base;
+ base = gic_dist_base_read_alias(irqd_to_hwirq(d));

return !!(readl_relaxed(base + offset + (index / 32) * 4) & mask);
}
@@ -580,6 +611,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type)
enum gic_intid_range range;
unsigned int irq = gic_irq(d);
void __iomem *base;
+ void __iomem *base_read_alias;
u32 offset, index;
int ret;

@@ -594,14 +626,17 @@ static int gic_set_type(struct irq_data *d, unsigned int type)
type != IRQ_TYPE_LEVEL_HIGH && type != IRQ_TYPE_EDGE_RISING)
return -EINVAL;

- if (gic_irq_in_rdist(d))
+ if (gic_irq_in_rdist(d)) {
base = gic_data_rdist_sgi_base();
- else
+ base_read_alias = base;
+ } else {
base = gic_data.dist_base;
+ base_read_alias = gic_dist_base_read_alias(irqd_to_hwirq(d));
+ }

offset = convert_offset_index(d, GICD_ICFGR, &index);
-
- ret = gic_configure_irq(index, type, base + offset, NULL);
+ ret = gic_configure_irq(index, type, base + offset, NULL,
+ base_read_alias + offset);
if (ret && (range == PPI_RANGE || range == EPPI_RANGE)) {
/* Misconfigured PPIs are usually not fatal */
pr_warn("GIC: PPI INTID%d is secure or misconfigured\n", irq);
@@ -1719,6 +1754,70 @@ static bool gic_enable_quirk_hip06_07(void *data)
return false;
}

+static bool gic_enable_quirk_nvidia_t241(void *data)
+{
+#ifdef CONFIG_ACPI
+ struct dist_base_alias *base_alias;
+ struct acpi_table_header *madt;
+ int i, intid, nchips = 0;
+ acpi_status status;
+ phys_addr_t phys;
+
+ status = acpi_get_table(ACPI_SIG_MADT, 0, &madt);
+ if (ACPI_FAILURE(status))
+ return false;
+
+ /* Check NVIDIA OEM ID */
+ if (memcmp(madt->oem_id, "NVIDIA", 6)) {

What guarantees do we have that this string will always be present?
"oem_id" is usually updated to reflect the integrator, not the
silicon vendor.


Our company provides UEFI firmware porting guidelines to OEMs that
ensure the MADT table generation, along with the ACPI header, remains
unaltered. Thanks to your input, we are now looking into alternative
approaches for identifying platform types and removing our dependence
on ACPI. Specifically, we are interested in utilizing the SMCCC API
to detect the CHIP. Determine whether the individual chips are present
by referring to the GICR regions described in MADT.

Seems like a reasonable alternative.



+ acpi_put_table(madt);
+ return false;
+ }
+
+ /* Find the number of chips based on OEM_TABLE_ID */
+ if ((!memcmp(madt->oem_table_id, "T241x3", 6)) ||
+ (!memcmp(madt->oem_table_id, "T241c3", 6))) {
+ nchips = 3;
+ } else if ((!memcmp(madt->oem_table_id, "T241x4", 6)) ||
+ (!memcmp(madt->oem_table_id, "T241c4", 6))) {
+ nchips = 4;
+ }

Same question for these. This seems pretty fragile.

This can be avoid for the SMCCC based platform detection.

+
+ acpi_put_table(madt);
+ if (nchips < 3)
+ return false;
+
+ base_alias = kmalloc_array(nchips, sizeof(*base_alias),
+ GFP_KERNEL | __GFP_ZERO);

You are fully initialising the structures, right? So why the
__GFP_ZERO?
Yes, not needed. will use the staic array since size is small after
removing INTID_start/end feilds.


+ if (!base_alias)
+ return false;
+
+ gic_data.base_read_aliases = base_alias;
+ gic_data.nr_dist_base_aliases = nchips;
+
+ /**
+ * Setup GICD alias and {E}SPIs range for each chip
+ * {E}SPI blocks mappings:
+ * Chip0 = 00-09
+ * Chip1 = 10-19
+ * Chip2 = 20-29
+ * Chip3 = 30-39

What are these ranges? From the code below, I can (sort of) guess that
each chip has 10 registers in the SPI/ESPI range, with chips 0-1
dealing with SPIs, and chips 2-3 dealing with ESPIs.

It would be a lot clearer if you indicated the actual INTID ranges.
Agree.


+ */
+ for (i = 0; i < nchips; i++, base_alias++) {
+ phys = ((1ULL << 44) * i) | 0x23580000;

Where is this address coming from? Can it be inferred from the MADT?
It would also be a lot more readable if written as:

#define CHIP_MASK GENMASK_ULL(45, 44)
#define CHIP_ALIAS_BASE 0x23580000

I'll define macros for constants. Use the offset, global GICD-PHYS,
and CHIP number to get the alias addressses.

#define T241_CHIPN_MASK GENMASK_ULL(45, 44)
#define T241_CHIP_GICDA_OFFSET 0x1580000

phys = gic_data.dist_phys_base + T241_CHIP_GICDA_OFFSET;
phys |= FIELD_PREP(T241_CHIPN_MASK, i);


phys = CHIP_ALIAS_BASE;
phys |= FIELD_PREP(CHIP_MASK, i);

+ base_alias->base = ioremap(phys, SZ_64K);
+ WARN_ON(!base_alias->base);
+
+ intid = i < 3 ? 32 + i * 10 * 32 : ESPI_BASE_INTID;
+ base_alias->intid_start = intid;
+ base_alias->intid_end = intid + 10 * 32 - 1;

This really is obfuscated. And it also shows that we really don't need
the INTID ranges in the data structure. You can easily get to the chip
number with something like:
ACK


switch (__get_intid_range(intid)) {
case SPI_RANGE:
chip = (intid - 32) / 320;
break;
case ESPI_RANGE:
chip = (intid - ESPI_BASE_INTID) / 320;
break;
}

alias = base_alias[chip];

Bonus point if you add a #define for the magic numbers.

ACK

+ }
+ static_branch_enable(&gic_nvidia_t241_erratum);
+ return true;
+#else
+ return false;
+#endif
+}

How about moving the whole function under #ifdef CONFIG_ACPI?


If you're not satisfied with SMCCC-based platform detection, I'll
make the necessary changes. We value your input and would appreciate
your opinion on whether we should use SMCCC or ACPI-OEM-ID based
platform detection. Our preference is to go with SMC if that's
agreeable to you.

If you can guarantee that this FW-based discovery will always be
available, then this is a more robust way of doing it.

We can assure you that the FW-based discovery will be available on
T241 platform. Thank you for acknowledging the alternative mechanism.



#define SMCCC_JEP106_BANK_ID(v) FIELD_GET(GENMASK(30, 24), (v))
#define SMCCC_JEP106_ID_CODE(v) FIELD_GET(GENMASK(22, 16), (v))
#define SMCCC_JEP106_SOC_ID(v) FIELD_GET(GENMASK(15, 0), (v))

#define JEP106_NVIDIA_BANK_ID 0x3
#define JEP106_NVIDIA_ID_CODE 0x6b
#define T241_CHIPN_MASK GENMASK_ULL(45, 44)
#define T241_CHIP_GICDA_OFFSET 0x1580000
#define T241_CHIP_ID 0x241

static bool gic_enable_quirk_nvidia_t241(void *data)
{
unsigned long chip_bmask = 0;
struct arm_smccc_res res;
phys_addr_t phys;
u32 i;

if ((arm_smccc_get_version() < ARM_SMCCC_VERSION_1_2) ||
(arm_smccc_1_1_get_conduit() == SMCCC_CONDUIT_NONE)) {
return false;
}

arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
ARM_SMCCC_ARCH_SOC_ID, &res);
if ((s32)res.a0 < 0)
return false;

arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 0, &res);
if ((s32)res.a0 < 0)
return false;

Most of this should probably directly come from the soc_id
infrastructure. It would need to probe early and expose the low-level
data.

Agree, I will move the above code to drivers/firmware/smccc with a new patch.

--- a/drivers/firmware/smccc/smccc.c
+++ b/drivers/firmware/smccc/smccc.c
@@ -17,9 +17,14 @@ static enum arm_smccc_conduit smccc_conduit = SMCCC_CONDUIT_NONE;

bool __ro_after_init smccc_trng_available = false;
u64 __ro_after_init smccc_has_sve_hint = false;
+bool __ro_after_init smccc_has_soc_id;
+s32 __ro_after_init smccc_soc_id_version;
+s32 __ro_after_init smccc_soc_id_revision;

void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
{
+ struct arm_smccc_res res;
+
smccc_version = version;
smccc_conduit = conduit;

@@ -27,6 +32,21 @@ void __init arm_smccc_version_init(u32 version, enum arm_smccc_conduit conduit)
if (IS_ENABLED(CONFIG_ARM64_SVE) &&
smccc_version >= ARM_SMCCC_VERSION_1_3)
smccc_has_sve_hint = true;
+
+ if ((smccc_version >= ARM_SMCCC_VERSION_1_2) &&
+ (smccc_conduit != SMCCC_CONDUIT_NONE)) {
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+ ARM_SMCCC_ARCH_SOC_ID, &res);
+ if ((s32)res.a0 >= 0) {
+ smccc_has_soc_id = true;
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 0, &res);
+ smccc_soc_id_version = (s32)res.a0;
+ arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_SOC_ID, 1, &res);
+ smccc_soc_id_revision = (s32)res.a0;
+ } else if ((s32)res.a0 != SMCCC_RET_NOT_SUPPORTED) {
+ pr_err("ARCH_SOC_ID returned error: %lx\n", res.a0);
+ }
+ }
}

GICv3:

#define T241_CHIPN_MASK GENMASK_ULL(45, 44)
#define T241_CHIP_GICDA_OFFSET 0x1580000
#define SMCCC_SOC_ID_MASK 0x7f7fffff
#define SMCCC_SOC_ID_T241 0x036b0241

static bool gic_enable_quirk_nvidia_t241(void *data)
{
unsigned long chip_bmask = 0;
phys_addr_t phys;
u32 i;

/* Check JEP106 code for NVIDIA T241 chip (036b:0241) */
if (!smccc_has_soc_id || (smccc_soc_id_version < 0) ||
((smccc_soc_id_version & SMCCC_SOC_ID_MASK) != SMCCC_SOC_ID_T241)) {
return false;
}