Re: smp_call_function_single lockups

From: Linus Torvalds
Date: Thu Feb 19 2015 - 16:59:53 EST


On Thu, Feb 19, 2015 at 12:29 PM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> Now, what happens if we send an EOI for an ExtINT interrupt? It
> basically ends up being a spurious IPI. And I *think* that what
> normally happens is absolutely nothing at all. But if in addition to
> the ExtINT, there was a pending IPI (or other pending ISR bit set),
> maybe we lose interrupts..
>
> .. and it's entirely possible that I'm just completely full of shit.
> Who is the poor bastard who has worked most with things like ExtINT,
> and can educate me? I'm adding Ingo, hpa and Jiang Liu as primary
> contacts..

So quite frankly, trying to follow all the logic from do_IRQ() through
handle_irq() to the actual low-level handler, I just couldn't do it.

So instead, I wrote a patch to verify that the ISR bit is actually set
when we do ack_APIC_irq().

This was complicated by the fact that we don't actually pass in the
vector number at all to the acking, so 99% of the patch is just doing
that. A couple of places we don't really have a good vector number, so
I said "screw it, a negative value means that we won't check the ISR).

The attached patch is quite possibly garbage, but it gives an
interesting warning for me during i8042 probing, so who knows. Maybe
it actually shows a real problem - or maybe I just screwed up the
patch.

.. and maybe even if the patch is fine, it's actually never really a
problem to have spurious APIC ACK cycles. Maybe it cannot make
interrupts be ignored.

Anyway, the back-trace for the warning I get is during boot:

...
PNP: No PS/2 controller found. Probing ports directly.
------------[ cut here ]------------
WARNING: CPU: 0 PID: 1 at ./arch/x86/include/asm/apic.h:436
ir_ack_apic_edge+0x74/0x80()
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted
3.19.0-08857-g89d3fa45b4ad-dirty #2
Call Trace:
<IRQ>
dump_stack+0x45/0x57
warn_slowpath_common+0x80/0xc0
warn_slowpath_null+0x15/0x20
ir_ack_apic_edge+0x74/0x80
handle_edge_irq+0x51/0x110
handle_irq+0x74/0x140
do_IRQ+0x4a/0x140
common_interrupt+0x6a/0x6a
<EOI>
? _raw_spin_unlock_irqrestore+0x9/0x10
__setup_irq+0x239/0x5a0
request_threaded_irq+0xc2/0x180
i8042_probe+0x5b8/0x680
platform_drv_probe+0x2f/0xa0
driver_probe_device+0x8b/0x3e0
__driver_attach+0x93/0xa0
bus_for_each_dev+0x63/0xa0
driver_attach+0x19/0x20
bus_add_driver+0x178/0x250
driver_register+0x5f/0xf0
__platform_driver_register+0x45/0x50
__platform_driver_probe+0x26/0xa0
__platform_create_bundle+0xad/0xe0
i8042_init+0x3d0/0x3f6
do_one_initcall+0xb8/0x1d0
kernel_init_freeable+0x16d/0x1fa
kernel_init+0x9/0xf0
ret_from_fork+0x7c/0xb0
---[ end trace 1de82c4457c6a0f0 ]---
serio: i8042 KBD port at 0x60,0x64 irq 1
serio: i8042 AUX port at 0x60,0x64 irq 12
...

and it looks not entirely insane.

Is this worth looking at? Or is it something spurious? I might have
gotten the vectors wrong, and maybe the warning is not because the ISR
bit isn't set, but because I test the wrong bit.

Linus
arch/x86/include/asm/apic.h | 22 ++++++++++++++--------
arch/x86/kernel/apic/apic.c | 10 +++++-----
arch/x86/kernel/apic/io_apic.c | 5 +++--
arch/x86/kernel/apic/vector.c | 8 +++++---
arch/x86/kernel/cpu/mcheck/therm_throt.c | 4 ++--
arch/x86/kernel/cpu/mcheck/threshold.c | 4 ++--
arch/x86/kernel/irq.c | 11 ++++++-----
arch/x86/kernel/irq_work.c | 8 ++++----
arch/x86/kernel/smp.c | 18 +++++++++---------
drivers/iommu/irq_remapping.c | 6 ++++--
10 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 92003f3c8a42..912a969fd747 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -387,7 +387,7 @@ static inline void apic_write(u32 reg, u32 val)
apic->write(reg, val);
}

-static inline void apic_eoi(void)
+static inline void apic_eoi(int vector)
{
apic->eoi_write(APIC_EOI, APIC_EOI_ACK);
}
@@ -418,7 +418,7 @@ extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));

static inline u32 apic_read(u32 reg) { return 0; }
static inline void apic_write(u32 reg, u32 val) { }
-static inline void apic_eoi(void) { }
+static inline void apic_eoi(int vector) { }
static inline u64 apic_icr_read(void) { return 0; }
static inline void apic_icr_write(u32 low, u32 high) { }
static inline void apic_wait_icr_idle(void) { }
@@ -427,13 +427,19 @@ static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}

#endif /* CONFIG_X86_LOCAL_APIC */

-static inline void ack_APIC_irq(void)
+static inline void ack_APIC_irq(int vector)
{
+ /* Is the ISR bit actually set for this vector? */
+ if (vector >= 16) {
+ unsigned v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
+ v >>= vector & 0x1f;
+ WARN_ON_ONCE(!(v & 1));
+ }
/*
* ack_APIC_irq() actually gets compiled as a single instruction
* ... yummie.
*/
- apic_eoi();
+ apic_eoi(vector);
}

static inline unsigned default_get_apic_id(unsigned long x)
@@ -631,9 +637,9 @@ static inline void entering_irq(void)
exit_idle();
}

-static inline void entering_ack_irq(void)
+static inline void entering_ack_irq(int vector)
{
- ack_APIC_irq();
+ ack_APIC_irq(vector);
entering_irq();
}

@@ -642,11 +648,11 @@ static inline void exiting_irq(void)
irq_exit();
}

-static inline void exiting_ack_irq(void)
+static inline void exiting_ack_irq(int vector)
{
irq_exit();
/* Ack only at the end to avoid potential reentry */
- ack_APIC_irq();
+ ack_APIC_irq(vector);
}

extern void ioapic_zap_locks(void);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad3639ae1b9b..20ef1236d057 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -910,7 +910,7 @@ __visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
- entering_ack_irq();
+ entering_ack_irq(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
exiting_irq();

@@ -929,7 +929,7 @@ __visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs)
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
- entering_ack_irq();
+ entering_ack_irq(LOCAL_TIMER_VECTOR);
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);
@@ -1342,7 +1342,7 @@ void setup_local_APIC(void)
value = apic_read(APIC_ISR + i*0x10);
for (j = 31; j >= 0; j--) {
if (value & (1<<j)) {
- ack_APIC_irq();
+ ack_APIC_irq(-1);
acked++;
}
}
@@ -1868,7 +1868,7 @@ static inline void __smp_spurious_interrupt(u8 vector)
*/
v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
if (v & (1 << (vector & 0x1f)))
- ack_APIC_irq();
+ ack_APIC_irq(vector);

inc_irq_stat(irq_spurious_count);

@@ -1917,7 +1917,7 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
v = apic_read(APIC_ESR);
- ack_APIC_irq();
+ ack_APIC_irq(ERROR_APIC_VECTOR);
atomic_inc(&irq_err_count);

apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f4dc2462a1ac..b74db5a2660b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1993,7 +1993,7 @@ static void ack_ioapic_level(struct irq_data *data)
* We must acknowledge the irq before we move it or the acknowledge will
* not propagate properly.
*/
- ack_APIC_irq();
+ ack_APIC_irq(i);

/*
* Tail end of clearing remote IRR bit (either by delivering the EOI
@@ -2067,7 +2067,8 @@ static void unmask_lapic_irq(struct irq_data *data)

static void ack_lapic_irq(struct irq_data *data)
{
- ack_APIC_irq();
+ struct irq_cfg *cfg = irqd_cfg(data);
+ ack_APIC_irq(cfg->vector);
}

static struct irq_chip lapic_chip __read_mostly = {
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 6cedd7914581..30643ffe061c 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -335,9 +335,11 @@ int apic_retrigger_irq(struct irq_data *data)

void apic_ack_edge(struct irq_data *data)
{
- irq_complete_move(irqd_cfg(data));
+ struct irq_cfg *cfg = irqd_cfg(data);
+
+ irq_complete_move(cfg);
irq_move_irq(data);
- ack_APIC_irq();
+ ack_APIC_irq(cfg->vector);
}

/*
@@ -397,7 +399,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;

- ack_APIC_irq();
+ ack_APIC_irq(IRQ_MOVE_CLEANUP_VECTOR);
irq_enter();
exit_idle();

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1af51b1586d7..89bac1f9ed78 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -433,7 +433,7 @@ asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_thermal_interrupt();
- exiting_ack_irq();
+ exiting_ack_irq(THERMAL_APIC_VECTOR);
}

asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
@@ -442,7 +442,7 @@ asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
__smp_thermal_interrupt();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
- exiting_ack_irq();
+ exiting_ack_irq(THERMAL_APIC_VECTOR);
}

/* Thermal monitoring depends on APIC, ACPI and clock modulation */
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index 7245980186ee..03068d1844c2 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -28,7 +28,7 @@ asmlinkage __visible void smp_threshold_interrupt(void)
{
entering_irq();
__smp_threshold_interrupt();
- exiting_ack_irq();
+ exiting_ack_irq(THRESHOLD_APIC_VECTOR);
}

asmlinkage __visible void smp_trace_threshold_interrupt(void)
@@ -37,5 +37,5 @@ asmlinkage __visible void smp_trace_threshold_interrupt(void)
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
__smp_threshold_interrupt();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
- exiting_ack_irq();
+ exiting_ack_irq(THRESHOLD_APIC_VECTOR);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 705ef8d48e2d..801e10d3d125 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -45,7 +45,8 @@ void ack_bad_irq(unsigned int irq)
* completely.
* But only ack when the APIC is enabled -AK
*/
- ack_APIC_irq();
+ /* This doesn't know the vector. It needs the irq descriptor */
+ ack_APIC_irq(-1);
}

#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -198,7 +199,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
irq = __this_cpu_read(vector_irq[vector]);

if (!handle_irq(irq, regs)) {
- ack_APIC_irq();
+ ack_APIC_irq(vector);

if (irq != VECTOR_RETRIGGERED) {
pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
@@ -230,7 +231,7 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);

- entering_ack_irq();
+ entering_ack_irq(X86_PLATFORM_IPI_VECTOR);
__smp_x86_platform_ipi();
exiting_irq();
set_irq_regs(old_regs);
@@ -244,7 +245,7 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);

- ack_APIC_irq();
+ ack_APIC_irq(POSTED_INTR_VECTOR);

irq_enter();

@@ -262,7 +263,7 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);

- entering_ack_irq();
+ entering_ack_irq(X86_PLATFORM_IPI_VECTOR);
trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
__smp_x86_platform_ipi();
trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 15d741ddfeeb..dfe92635307c 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -10,10 +10,10 @@
#include <asm/apic.h>
#include <asm/trace/irq_vectors.h>

-static inline void irq_work_entering_irq(void)
+static inline void irq_work_entering_irq(int vector)
{
irq_enter();
- ack_APIC_irq();
+ ack_APIC_irq(vector);
}

static inline void __smp_irq_work_interrupt(void)
@@ -24,14 +24,14 @@ static inline void __smp_irq_work_interrupt(void)

__visible void smp_irq_work_interrupt(struct pt_regs *regs)
{
- irq_work_entering_irq();
+ irq_work_entering_irq(IRQ_WORK_VECTOR);
__smp_irq_work_interrupt();
exiting_irq();
}

__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
{
- irq_work_entering_irq();
+ irq_work_entering_irq(IRQ_WORK_VECTOR);
trace_irq_work_entry(IRQ_WORK_VECTOR);
__smp_irq_work_interrupt();
trace_irq_work_exit(IRQ_WORK_VECTOR);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index be8e1bde07aa..44b74b24c5e4 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -170,7 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)

asmlinkage __visible void smp_reboot_interrupt(void)
{
- ack_APIC_irq();
+ ack_APIC_irq(REBOOT_VECTOR);
irq_enter();
stop_this_cpu(NULL);
irq_exit();
@@ -258,16 +258,16 @@ static inline void __smp_reschedule_interrupt(void)

__visible void smp_reschedule_interrupt(struct pt_regs *regs)
{
- ack_APIC_irq();
+ ack_APIC_irq(RESCHEDULE_VECTOR);
__smp_reschedule_interrupt();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
}

-static inline void smp_entering_irq(void)
+static inline void smp_entering_irq(int vector)
{
- ack_APIC_irq();
+ ack_APIC_irq(vector);
irq_enter();
}

@@ -279,7 +279,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
* scheduler_ipi(). This is OK, since those functions are allowed
* to nest.
*/
- smp_entering_irq();
+ smp_entering_irq(RESCHEDULE_VECTOR);
trace_reschedule_entry(RESCHEDULE_VECTOR);
__smp_reschedule_interrupt();
trace_reschedule_exit(RESCHEDULE_VECTOR);
@@ -297,14 +297,14 @@ static inline void __smp_call_function_interrupt(void)

__visible void smp_call_function_interrupt(struct pt_regs *regs)
{
- smp_entering_irq();
+ smp_entering_irq(CALL_FUNCTION_VECTOR);
__smp_call_function_interrupt();
exiting_irq();
}

__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
{
- smp_entering_irq();
+ smp_entering_irq(CALL_FUNCTION_VECTOR);
trace_call_function_entry(CALL_FUNCTION_VECTOR);
__smp_call_function_interrupt();
trace_call_function_exit(CALL_FUNCTION_VECTOR);
@@ -319,14 +319,14 @@ static inline void __smp_call_function_single_interrupt(void)

__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
{
- smp_entering_irq();
+ smp_entering_irq(CALL_FUNCTION_SINGLE_VECTOR);
__smp_call_function_single_interrupt();
exiting_irq();
}

__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
{
- smp_entering_irq();
+ smp_entering_irq(CALL_FUNCTION_SINGLE_VECTOR);
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
__smp_call_function_single_interrupt();
trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index 390079ee1350..896c4b35422d 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -334,12 +334,14 @@ void panic_if_irq_remap(const char *msg)

static void ir_ack_apic_edge(struct irq_data *data)
{
- ack_APIC_irq();
+ struct irq_cfg *cfg = irqd_cfg(data);
+ ack_APIC_irq(cfg->vector);
}

static void ir_ack_apic_level(struct irq_data *data)
{
- ack_APIC_irq();
+ struct irq_cfg *cfg = irqd_cfg(data);
+ ack_APIC_irq(cfg->vector);
eoi_ioapic_irq(data->irq, irqd_cfg(data));
}