Re: [PATCH] x86/apic: Add retry mechanism to add_pin_to_irq_node()

From: Thomas Gleixner
Date: Mon Jul 29 2024 - 13:44:20 EST


Breno!

On Mon, Jul 29 2024 at 09:55, Breno Leitao wrote:
>> > panic
>> > _printk
>> > panic_smp_self_stop
>> > rcu_is_watching
>> > intel_irq_remapping_free
>>
>> This completely lacks context. When does this happen? What's the system
>> state? What has intel_irq_remapping_free() to do with the allocation path?
>
> Sorry, let me clarify it a bit better:
>
> 1) This happens when the machine is booted up, and being under stress
> 2) This happens when I have failslab fault injection enabled.
> 3) The machine crashes after hitting this error.
> 4) This is reproducible with `stress-ng` using the `--aggressive` parameter
> 5) This is the full stack (sorry for not decoding the stack, but if you
> need it, I am more than happy to give you a decoded stack)

Ok. That makes sense.

>> While it seems to make sense, the reality is that this is mostly early
>> boot code. If there is a real world memory allocation failure during
>> early boot then retries will not help at all.
>
> This is not happening at early boot, this is reproducible when running
> stress-ng in this aggressive mode.
>
> Since I have failslab injecting a kmalloc fault,
> __add_pin_to_irq_noder() returns -ENOMEM, which causes the undesired
> panic().

Fine. During runtime that allocation fail should not be fatal. It just
needs to be properly propagated.

>> > Introduce a retry mechanism that attempts to add the pin up to 3 times
>> > before giving up and panicking. This should improve the robustness of
>> > the IO-APIC code in the face of transient errors.
>>
>> I'm absolutely not convinced by this loop heuristic. That's just a bad
>> hack.
>
> I will not disagree with you here, but I need to use this patch in order
> to be able t keep the system not panicking and stable while fault
> injecting slab errors and trying to reproduce a real bug in the network
> stack.

Something like the untested below should just work.

Thanks,

tglx
---
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -352,27 +352,26 @@ static void ioapic_mask_entry(int apic,
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int __add_pin_to_irq_node(struct mp_chip_data *data,
- int node, int apic, int pin)
+static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin)
{
struct irq_pin_list *entry;

/* don't allow duplicates */
- for_each_irq_pin(entry, data->irq_2_pin)
+ for_each_irq_pin(entry, data->irq_2_pin) {
if (entry->apic == apic && entry->pin == pin)
- return 0;
+ return true;
+ }

entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
if (!entry) {
- pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
- node, apic, pin);
- return -ENOMEM;
+ pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", node, apic, pin);
+ return false;
}
+
entry->apic = apic;
entry->pin = pin;
list_add_tail(&entry->list, &data->irq_2_pin);
-
- return 0;
+ return true;
}

static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
@@ -387,35 +386,6 @@ static void __remove_pin_from_irq(struct
}
}

-static void add_pin_to_irq_node(struct mp_chip_data *data,
- int node, int apic, int pin)
-{
- if (__add_pin_to_irq_node(data, node, apic, pin))
- panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
- int oldapic, int oldpin,
- int newapic, int newpin)
-{
- struct irq_pin_list *entry;
-
- for_each_irq_pin(entry, data->irq_2_pin) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- /* every one is different, right? */
- return;
- }
- }
-
- /* old apic/pin didn't exist, so just add new ones */
- add_pin_to_irq_node(data, node, newapic, newpin);
-}
-
static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
void (*final)(struct irq_pin_list *entry))
{
@@ -1002,8 +972,7 @@ static int alloc_isa_irq_from_domain(str
if (irq_data && irq_data->parent_data) {
if (!mp_check_pin_attr(irq, info))
return -EBUSY;
- if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
- info->ioapic.pin))
+ if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin))
return -ENOMEM;
} else {
info->flags |= X86_IRQ_ALLOC_LEGACY;
@@ -2131,10 +2100,10 @@ static int __init disable_timer_pin_setu
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);

-static int mp_alloc_timer_irq(int ioapic, int pin)
+static int __init mp_alloc_timer_irq(int ioapic, int pin)
{
- int irq = -1;
struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ int irq = -1;

if (domain) {
struct irq_alloc_info info;
@@ -2150,6 +2119,24 @@ static int mp_alloc_timer_irq(int ioapic
return irq;
}

+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ return;
+ }
+ }
+
+ /* Old apic/pin didn't exist, so just add a new one */
+ add_pin_to_irq_node(data, node, newapic, newpin);
+}
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -2996,9 +2983,9 @@ int mp_irqdomain_alloc(struct irq_domain
unsigned int nr_irqs, void *arg)
{
struct irq_alloc_info *info = arg;
+ int ret = -ENOMEM, ioapic, pin;
struct mp_chip_data *data;
struct irq_data *irq_data;
- int ret, ioapic, pin;
unsigned long flags;

if (!info || nr_irqs > 1)
@@ -3016,22 +3003,21 @@ int mp_irqdomain_alloc(struct irq_domain
if (!data)
return -ENOMEM;

- ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
- if (ret < 0) {
- kfree(data);
- return ret;
- }
-
INIT_LIST_HEAD(&data->irq_2_pin);
irq_data->hwirq = info->ioapic.pin;
irq_data->chip = (domain->parent == x86_vector_domain) ?
&ioapic_chip : &ioapic_ir_chip;
irq_data->chip_data = data;
mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
+ mp_preconfigure_entry(data);

- add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+ if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin))
+ goto out_data;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+ if (ret < 0)
+ goto out_pin;

- mp_preconfigure_entry(data);
mp_register_handler(virq, data->is_level);

local_irq_save(flags);
@@ -3044,6 +3030,12 @@ int mp_irqdomain_alloc(struct irq_domain
ioapic, mpc_ioapic_id(ioapic), pin, virq,
data->is_level, data->active_low);
return 0;
+
+out_pin:
+ __remove_pin_from_irq(data, ioapic, (int)irq_data->hwirq);
+out_data:
+ kfree(data);
+ return ret;
}

void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,