Re: BUG: soft lockup in smp_call_function

From: Tetsuo Handa
Date: Thu Sep 14 2023 - 09:14:31 EST


On 2023/09/14 21:21, Hillf Danton wrote:
> On Wed, 13 Sep 2023 23:30:23 +0900 Tetsuo Handa wrote:
>> On 2023/09/13 20:07, Hillf Danton wrote:
>>>
>>> cpu1 cpu4 (see below)
>>> ==== ====
>>> drm_crtc_vblank_off __run_hrtimer
>>> spin_lock_irq(&dev->event_lock);
>>> ...
>>> drm_handle_vblank
>>> hrtimer_cancel spin_lock_irqsave(&dev->event_lock, irqflags);
>>>
>>>
>>> Deadlock should have been reported instead provided the lockdep_map in
>>> struct timer_list were added also to hrtimer, so it is highly appreciated
>>> if Tetsuo or Thomas adds it before 6.8 or 6.10.
>>
>> Not me. ;-)
>>
>> Since hrtimer_cancel() retries forever until lock_hrtimer_base() succeeds,
>> we want to add a lockdep annotation into hrtimer_cancel() so that we can
>> detect this type of deadlock?

Here is a reproducer.

----------------------------------------
#include <linux/module.h>
#include <linux/delay.h>
static DEFINE_SPINLOCK(lock1);
static struct hrtimer timer1;
/*
 * hrtimer callback, runs in hardirq context. Deliberately contends on
 * lock1 to reproduce the deadlock: once test_init() holds lock1 and spins
 * in hrtimer_cancel(), this callback can never make progress.
 */
static enum hrtimer_restart timer_func(struct hrtimer *timer)
{
unsigned long flags;
mdelay(100); // Wait for test_init() to hold lock1.
/* Deadlocks here: lock1 is held by test_init(), which in turn waits in
 * hrtimer_cancel() for this callback to return. */
spin_lock_irqsave(&lock1, flags);
spin_unlock_irqrestore(&lock1, flags);
return HRTIMER_RESTART; // NOTE(review): no hrtimer_forward() before restart; acceptable for a reproducer.
}
/*
 * Module init: arms timer1, then takes lock1 and calls hrtimer_cancel()
 * while the callback is running. hrtimer_cancel() spins until the callback
 * finishes, but the callback is blocked on lock1 -> AB-BA style deadlock
 * that lockdep currently cannot see. Statement order is essential.
 */
static int __init test_init(void)
{
unsigned long flags;
hrtimer_init(&timer1, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
timer1.function = &timer_func;
hrtimer_start(&timer1, 1, HRTIMER_MODE_REL); // 1 ns relative expiry: fires almost immediately.
mdelay(10); // Wait for timer_func() to start.
spin_lock_irqsave(&lock1, flags);
hrtimer_cancel(&timer1); // Wait for timer_func() to finish. <- spins forever while holding lock1.
spin_unlock_irqrestore(&lock1, flags);
return -EINVAL; // Always fail the load so the module never stays resident.
}
module_init(test_init);
MODULE_LICENSE("GPL");
----------------------------------------

----------------------------------------
[ 996.507681] test: loading out-of-tree module taints kernel.
[ 996.514019] test: module verification failed: signature and/or required key missing - tainting kernel
[ 1061.893054] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[ 1061.900411] rcu: 4-...0: (1 GPs behind) idle=ed6c/1/0x4000000000000000 softirq=3304/3305 fqs=15784
[ 1061.909128] rcu: (detected by 0, t=65018 jiffies, g=12625, q=4422 ncpus=12)
[ 1061.915003] Sending NMI from CPU 0 to CPUs 4:
[ 1061.918972] NMI backtrace for cpu 4
[ 1061.919036] CPU: 4 PID: 3826 Comm: insmod Tainted: G OE 6.6.0-rc1+ #20
[ 1061.919093] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020
[ 1061.919095] RIP: 0010:delay_tsc+0x34/0xa0
[ 1061.919560] Code: ff 05 e8 b1 26 70 65 44 8b 0d e4 b1 26 70 0f 01 f9 66 90 48 c1 e2 20 48 09 c2 49 89 d0 eb 21 65 ff 0d c8 b1 26 70 74 54 f3 90 <65> ff 05 bd b1 26 70 65 8b 35 ba b1 26 70 41 39 f1 75 28 41 89 f1
[ 1061.919563] RSP: 0018:ffffb471c059cf00 EFLAGS: 00000083
[ 1061.919567] RAX: 0000028efe104ef6 RBX: 0000000000000041 RCX: 0000000000000004
[ 1061.919569] RDX: 00000000002192f8 RSI: 0000000000000004 RDI: 000000000027d81e
[ 1061.919571] RBP: ffff8970dafe5040 R08: 0000028efdeebbfe R09: 0000000000000004
[ 1061.919572] R10: 0000000000000001 R11: ffffffffc0a8d600 R12: ffffffff90e030e0
[ 1061.919574] R13: ffff8970dafe5040 R14: ffffffffc0a8b010 R15: ffff8970dafe5100
[ 1061.919630] FS: 00007fdd998eb740(0000) GS:ffff8970dae00000(0000) knlGS:0000000000000000
[ 1061.919633] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1061.919635] CR2: 0000000001edf678 CR3: 00000001a1060000 CR4: 0000000000350ee0
[ 1061.919639] Call Trace:
[ 1061.919641] <NMI>
[ 1061.919646] ? nmi_cpu_backtrace+0xb1/0x130
[ 1061.919711] ? nmi_cpu_backtrace_handler+0x11/0x20
[ 1061.922096] ? nmi_handle+0xe4/0x290
[ 1061.922163] ? default_do_nmi+0x49/0x100
[ 1061.922196] ? exc_nmi+0x152/0x1e0
[ 1061.922198] ? end_repeat_nmi+0x16/0x67
[ 1061.922340] ? __pfx_timer_func+0x10/0x10 [test]
[ 1061.922347] ? delay_tsc+0x34/0xa0
[ 1061.922349] ? delay_tsc+0x34/0xa0
[ 1061.922350] ? delay_tsc+0x34/0xa0
[ 1061.922352] </NMI>
[ 1061.922353] <IRQ>
[ 1061.922353] timer_func+0x19/0xff0 [test]
[ 1061.922358] __hrtimer_run_queues+0x177/0x3a0
[ 1061.922362] hrtimer_interrupt+0x104/0x240
[ 1061.922364] ? __do_softirq+0x2db/0x392
[ 1061.922827] __sysvec_apic_timer_interrupt+0x64/0x180
[ 1061.922833] sysvec_apic_timer_interrupt+0x65/0x80
[ 1061.922894] </IRQ>
[ 1061.922896] <TASK>
[ 1061.922898] asm_sysvec_apic_timer_interrupt+0x1a/0x20
[ 1061.922902] RIP: 0010:delay_tsc+0x4d/0xa0
[ 1061.922907] Code: c2 49 89 d0 eb 21 65 ff 0d c8 b1 26 70 74 54 f3 90 65 ff 05 bd b1 26 70 65 8b 35 ba b1 26 70 41 39 f1 75 28 41 89 f1 0f 01 f9 <66> 90 48 c1 e2 20 48 09 d0 48 89 c2 4c 29 c2 48 39 fa 72 c8 65 ff
[ 1061.922909] RSP: 0018:ffffb471c1e63bd0 EFLAGS: 00000246
[ 1061.922912] RAX: 00000000751ed8ab RBX: 000000000000000a RCX: 0000000000000004
[ 1061.922914] RDX: 0000000000000267 RSI: 0000000000000004 RDI: 000000000027d81e
[ 1061.922915] RBP: ffffffffc0a91010 R08: 00000267751adc59 R09: 0000000000000004
[ 1061.922917] R10: 0000000000000001 R11: ffffffff90cd85c8 R12: 0000000000000000
[ 1061.922918] R13: ffffb471c1e63d20 R14: 0000000000000000 R15: ffffffffc0a8d080
[ 1061.922923] ? __pfx_test_init+0x10/0x10 [test]
[ 1061.922934] test_init+0x52/0xff0 [test]
[ 1061.922941] do_one_initcall+0x5c/0x280
[ 1061.923004] ? kmalloc_trace+0xa9/0xc0
[ 1061.923105] do_init_module+0x60/0x240
[ 1061.923111] load_module+0x1f6e/0x20d0
[ 1061.923119] ? ima_post_read_file+0xe3/0xf0
[ 1061.923225] ? init_module_from_file+0x88/0xc0
[ 1061.923229] init_module_from_file+0x88/0xc0
[ 1061.923238] idempotent_init_module+0x19c/0x250
[ 1061.923244] ? security_capable+0x39/0x60
[ 1061.923304] __x64_sys_finit_module+0x5b/0xb0
[ 1061.923310] do_syscall_64+0x3b/0x90
[ 1061.923366] entry_SYSCALL_64_after_hwframe+0x6e/0xd8
[ 1061.923421] RIP: 0033:0x7fdd986f8e29
[ 1061.923427] Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 17 e0 2c 00 f7 d8 64 89 01 48
[ 1061.923429] RSP: 002b:00007ffe2f34dd18 EFLAGS: 00000206 ORIG_RAX: 0000000000000139
[ 1061.923432] RAX: ffffffffffffffda RBX: 0000000001ede240 RCX: 00007fdd986f8e29
[ 1061.923434] RDX: 0000000000000000 RSI: 000000000041a96e RDI: 0000000000000003
[ 1061.923435] RBP: 000000000041a96e R08: 0000000000000000 R09: 00007ffe2f34deb8
[ 1061.923436] R10: 0000000000000003 R11: 0000000000000206 R12: 0000000000000000
[ 1061.923437] R13: 0000000001ede210 R14: 0000000000000000 R15: 0000000000000000
[ 1061.923444] </TASK>
[ 1061.923446] INFO: NMI handler (nmi_cpu_backtrace_handler) took too long to run: 4.474 msecs
----------------------------------------

>
> Yes, you are right.
>
> The diff below is my two cents (only for thoughts).
>

I'm thinking something like below. (Completely untested.)

I haven't checked IRQ state handling. But in the last diff chunk, why is raw_spin_unlock_irqrestore()
(which does not re-enable IRQs if the caller already had IRQs disabled) used before calling the callback
function, while raw_spin_lock_irq() (which unconditionally disables IRQs) is used after calling the
callback function? Is it legal to disable IRQs again when the caller already disabled IRQs?

----------------------------------------
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 0ee140176f10..5640730ec31c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -123,8 +123,11 @@ struct hrtimer {
u8 state;
u8 is_rel;
u8 is_soft;
u8 is_hard;
+#ifdef CONFIG_LOCKDEP
+ struct lockdep_map lockdep_map;
+#endif
};

/**
* struct hrtimer_sleeper - simple sleeper structure
@@ -440,15 +443,15 @@ static inline void hrtimer_restart(struct hrtimer *timer)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}

/* Query timers: */
-extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust);
+extern ktime_t __hrtimer_get_remaining(struct hrtimer *timer, bool adjust);

/**
* hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
*/
-static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+static inline ktime_t hrtimer_get_remaining(struct hrtimer *timer)
{
return __hrtimer_get_remaining(timer, false);
}

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 238262e4aba7..fe0681d34b56 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -161,14 +161,23 @@ static inline bool is_migration_base(struct hrtimer_clock_base *base)
* possible to set timer->base = &migration_base and drop the lock: the timer
* remains locked.
*/
static
-struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+struct hrtimer_clock_base *lock_hrtimer_base(struct hrtimer *timer,
unsigned long *flags)
__acquires(&timer->base->lock)
{
struct hrtimer_clock_base *base;

+#ifdef CONFIG_LOCKDEP
+ unsigned long flags2;
+
+ local_irq_save(flags2);
+ lock_map_acquire(&timer->lockdep_map);
+ lock_map_release(&timer->lockdep_map);
+ local_irq_restore(flags2);
+#endif
+
for (;;) {
base = READ_ONCE(timer->base);
if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
@@ -1456,9 +1465,9 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
* __hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
* @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
*/
-ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
+ktime_t __hrtimer_get_remaining(struct hrtimer *timer, bool adjust)
{
unsigned long flags;
ktime_t rem;

@@ -1574,8 +1583,14 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
timer->is_soft = softtimer;
timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
+#ifdef CONFIG_LOCKDEP
+ {
+ static struct lock_class_key __key;
+ lockdep_init_map(&timer->lockdep_map, "hrtimer", &__key, 0);
+ }
+#endif
}

/**
* hrtimer_init - initialize a timer to the given clock
@@ -1684,9 +1699,19 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
trace_hrtimer_expire_entry(timer, now);
expires_in_hardirq = lockdep_hrtimer_enter(timer);

+#ifdef CONFIG_LOCKDEP
+ local_irq_save(flags);
+ lock_map_acquire(&timer->lockdep_map);
+ local_irq_restore(flags);
+#endif
restart = fn(timer);
+#ifdef CONFIG_LOCKDEP
+ local_irq_save(flags);
+ lock_map_release(&timer->lockdep_map);
+ local_irq_restore(flags);
+#endif

lockdep_hrtimer_exit(expires_in_hardirq);
trace_hrtimer_expire_exit(timer);
raw_spin_lock_irq(&cpu_base->lock);
----------------------------------------