Re: ast: cursor flashing softlockups

From: David Daney
Date: Tue May 17 2016 - 13:39:30 EST


I can confirm this. The cursor is blinking along nicely with a 200mS on/off time then with this patch installed:

diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 6e92917..7855446 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -402,6 +402,8 @@ static void cursor_timer_handler(unsigned long dev_addr)
struct fbcon_ops *ops = info->fbcon_par;

queue_work(system_power_efficient_wq, &info->queue);
+ if (WARN_ON(ops->cur_blink_jiffies < 10))
+ ops->cur_blink_jiffies = 200;
mod_timer(&ops->cursor_timer, jiffies + ops->cur_blink_jiffies);
}

@@ -417,6 +419,8 @@ static void fbcon_add_cursor_timer(struct fb_info *info)

init_timer(&ops->cursor_timer);
ops->cursor_timer.function = cursor_timer_handler;
+ if (WARN_ON(ops->cur_blink_jiffies < 10))
+ ops->cur_blink_jiffies = 200;
ops->cursor_timer.expires = jiffies + ops->cur_blink_jiffies;
ops->cursor_timer.data = (unsigned long ) info;
add_timer(&ops->cursor_timer);
@@ -1096,6 +1100,7 @@ static void fbcon_init(struct vc_data *vc, int init)

ops = info->fbcon_par;
ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms);
+ WARN_ON(ops->cur_blink_jiffies < 10);
p->con_rotate = initial_rotation;
set_blitting_type(vc, info);

@@ -1310,6 +1315,7 @@ static void fbcon_cursor(struct vc_data *vc, int mode)
int c = scr_readw((u16 *) vc->vc_pos);

ops->cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms);
+ WARN_ON(ops->cur_blink_jiffies < 10);

if (fbcon_is_inactive(vc, info) || vc->vc_deccm != 1)
return;

-------------------------------------------------------------

I get:


[ 29.386066] ------------[ cut here ]------------
[ 29.386080] WARNING: CPU: 0 PID: 1688 at drivers/video/console/fbcon.c:1103 fbcon_init+0x47c/0x4b8
[ 29.386145] Modules linked in: vfat(E) fat(E) aes_ce_blk(E) ablk_helper(E) cryptd(E) aes_ce_cipher(E) ghash_ce(E) sha2_ce(E) sha1_ce(E) sg(E) ip_tables(E) xfs(E) libcrc32c(E) nicvf(E) ast(E) i2c_algo_bit(E) drm_kms_helper(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) fb_sys_fops(E) ttm(E) drm(E) i2c_core(E) nicpf(E) thunder_bgx(E) mdio_thunder(E) mdio_cavium(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E)
[ 29.386147]
[ 29.386153] CPU: 0 PID: 1688 Comm: systemd-logind Tainted: G E 4.6.0-rc3-arm64next+ #278
[ 29.386155] Hardware name: www.cavium.com ThunderX CRB-2S/ThunderX CRB-2S, BIOS 0.3 Apr 25 2016
[ 29.386159] task: fffffe0fd1109d80 ti: fffffe0fd119c000 task.ti: fffffe0fd119c000
[ 29.386163] PC is at fbcon_init+0x47c/0x4b8
[ 29.386167] LR is at fbcon_init+0x144/0x4b8
[ 29.386170] pc : [<fffffc0008468930>] lr : [<fffffc00084685f8>] pstate: 80000145
[ 29.386172] sp : fffffe0fd119f900
[ 29.386179] x29: fffffe0fd119f900 x28: fffffe0fdc059c00
[ 29.386186] x27: fffffe001c7c8800 x26: fffffe0fd6027910
[ 29.386192] x25: fffffe0fd6027800 x24: fffffc0009b56f98
[ 29.386198] x23: 0000000000000001 x22: fffffc0008d1a000
[ 29.386205] x21: fffffe0ff4062000 x20: fffffc0009b57018
[ 29.386211] x19: fffffc0009b56000 x18: 000002aae7bd51f0
[ 29.386222] x17: 000003ffb542f9f0 x16: fffffc0008242b80
[ 29.386228] x15: 000002aae7bdb328 x14: 0000000000000006
[ 29.386234] x13: 000002aae7bc72c8 x12: 0000000000000000
[ 29.386240] x11: 0000000000000000 x10: 0000000000000000
[ 29.386247] x9 : 0000000000000000 x8 : fffffe001c7c8b18
[ 29.386253] x7 : 0000000000000000 x6 : 000000000000007f
[ 29.386259] x5 : fffffc0008c91640 x4 : 0000000000000000
[ 29.386266] x3 : 0000000000000800 x2 : fffffe001c7c8ce8
[ 29.386272] x1 : fffffe0fd4df4e00 x0 : 0000000000000000
[ 29.386274]
[ 29.386277] ---[ end trace 6c32ddc01008c9ba ]---
[ 29.386280] Call trace:
[ 29.386284] Exception stack(0xfffffe0fd119f730 to 0xfffffe0fd119f850)
[ 29.386288] f720: fffffc0009b56000 fffffc0009b57018
[ 29.386292] f740: fffffe0fd119f900 fffffc0008468930 0000000080000145 000000000000003d
[ 29.386296] f760: fffffc0008c36ce8 fffffc00081c2a30 fffffe0fd119f7f0 fffffc00080f47d8
[ 29.386299] f780: fffffe0fd119f800 fffffc00080f47d8 000000000000010c fffffe0fd1109d80
[ 29.386303] f7a0: 0000000000000000 000000000025810c fffffe0fd110a5a0 0000000000000000
[ 29.386307] f7c0: fffffc000880d2e8 0000000000000000 fffffe0fd119f850 fffffc00080f47d8
[ 29.386311] f7e0: 0000000000000000 fffffe0fd4df4e00 fffffe001c7c8ce8 0000000000000800
[ 29.386314] f800: 0000000000000000 fffffc0008c91640 000000000000007f 0000000000000000
[ 29.386318] f820: fffffe001c7c8b18 0000000000000000 0000000000000000 0000000000000000
[ 29.386321] f840: 0000000000000000 000002aae7bc72c8
[ 29.386327] [<fffffc0008468930>] fbcon_init+0x47c/0x4b8
[ 29.386332] [<fffffc00084d6e3c>] visual_init+0xbc/0x114
[ 29.386336] [<fffffc00084d9938>] vc_allocate+0x108/0x1e0
[ 29.386340] [<fffffc00084d9a44>] con_install+0x34/0x100
[ 29.386347] [<fffffc00084c2f44>] tty_init_dev+0x74/0x1a8
[ 29.386351] [<fffffc00084c3560>] tty_open+0x4e8/0x5a8
[ 29.386357] [<fffffc0008248a78>] chrdev_open+0xa8/0x1ac
[ 29.386361] [<fffffc0008241288>] do_dentry_open+0x1e8/0x300
[ 29.386364] [<fffffc0008242694>] vfs_open+0x6c/0x7c
[ 29.386370] [<fffffc0008252864>] do_last+0x12c/0xc24
[ 29.386374] [<fffffc00082533d8>] path_openat+0x7c/0x2b8
[ 29.386377] [<fffffc00082546d4>] do_filp_open+0x78/0xd4
[ 29.386381] [<fffffc0008242a74>] do_sys_open+0x150/0x214
[ 29.386384] [<fffffc0008242bbc>] SyS_openat+0x3c/0x48
[ 29.386389] [<fffffc000808538c>] __sys_trace_return+0x0/0x4
.
.
.
[ OK ] Started Command Scheduler.
Starting Command Scheduler...
[ 33.395141] ------------[ cut here ]------------
[ 33.399773] WARNING: CPU: 0 PID: 0 at drivers/video/console/fbcon.c:405 cursor_timer_handler+0x5c/0x6c
[ 33.409072] Modules linked in: ip6t_REJECT(E) nf_reject_ipv6(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_conntrack(E) ebtable_nat(E) ebtable_broute(E) bridge(E) stp(E) llc(E) ebtable_filter(E) ebtables(E) ip6table_nat(E) nf_conntrack_ipv6(E) nf_defrag_ipv6(E) nf_nat_ipv6(E) ip6table_mangle(E) ip6table_security(E) ip6table_raw(E) ip6table_filter(E) ip6_tables(E) iptable_nat(E) nf_conntrack_ipv4(E) nf_defrag_ipv4(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E) iptable_mangle(E) iptable_security(E) iptable_raw(E) iptable_filter(E) vfat(E) fat(E) aes_ce_blk(E) ablk_helper(E) cryptd(E) aes_ce_cipher(E) ghash_ce(E) sha2_ce(E) sha1_ce(E) sg(E) ip_tables(E) xfs(E) libcrc32c(E) nicvf(E) ast(E) i2c_algo_bit(E) drm_kms_helper(E) syscopyarea(E) sysfillrect(E) sysimgblt(E) fb_sys_fops(E) ttm(E) drm(E) i2c_core(E) nicpf(E) thunder_bgx(E) mdio_thunder(E) mdio_cavium(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E)
[ 33.488864]
[ 33.490358] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G W E 4.6.0-rc3-arm64next+ #278
[ 33.498877] Hardware name: www.cavium.com ThunderX CRB-2S/ThunderX CRB-2S, BIOS 0.3 Apr 25 2016
[ 33.507571] task: fffffc0008c79c80 ti: fffffc0008c50000 task.ti: fffffc0008c50000
[ 33.515053] PC is at cursor_timer_handler+0x5c/0x6c
[ 33.519930] LR is at cursor_timer_handler+0x30/0x6c
[ 33.524804] pc : [<fffffc0008467490>] lr : [<fffffc0008467464>] pstate: 80000145
[ 33.532198] sp : ffffff0ff9dc3da0
[ 33.535513] x29: ffffff0ff9dc3da0 x28: fffffc0008c70be8
[ 33.540836] x27: fffffc0008c70000 x26: fffffc0008d68fb3
[ 33.546158] x25: fffffc0008c70000 x24: fffffc0008d6a000
[ 33.551483] x23: fffffe0fd6027800 x22: fffffc0008467434
[ 33.556809] x21: 0000000000000101 x20: fffffe0fd6027800
[ 33.562138] x19: fffffe0fdc059c00 x18: 000002aadcf70050
[ 33.567464] x17: 000003ff95cfa228 x16: fffffc000829628c
[ 33.572786] x15: 00000000000000dd x14: ffffffc2c3bfa000
[ 33.578114] x13: 003d093352dc0000 x12: 00000000003d0900
[ 33.583440] x11: 0000000000000000 x10: 00003d09334ec400
[ 33.588765] x9 : fffffc0008c50000 x8 : fffffc0008851320
[ 33.594090] x7 : 00000000ad583ee9 x6 : 0000000000000001
[ 33.599412] x5 : fffffc0008c79c80 x4 : fffffc0009012c28
[ 33.604743] x3 : 0000000000000004 x2 : 0000000000000002
[ 33.610074] x1 : 0000000000000000 x0 : 0000000000000001
[ 33.615395]
[ 33.616886] ---[ end trace 6c32ddc01008c9c3 ]---
[ 33.621506] Call trace:
[ 33.623951] Exception stack(0xffffff0ff9dc3bd0 to 0xffffff0ff9dc3cf0)
[ 33.630390] 3bc0: fffffe0fdc059c00 fffffe0fd6027800
[ 33.638221] 3be0: ffffff0ff9dc3da0 fffffc0008467490 0000000080000145 000000000000003d
[ 33.646050] 3c00: fffffe0fd46b5700 ffffff0ff9dd8f00 fffffc0008d69de0 0000000000000000
[ 33.653882] 3c20: fffffc0008c36ce8 fffffc0008c24b28 fffffc0008c70be8 fffffc0008da5b48
[ 33.661708] 3c40: ffffff0ff9dc3cb0 fffffc0008814614 ffffff0ff9dd8f00 fffffc0008d68ea4
[ 33.669535] 3c60: fffffc0008d69de0 0000000000000000 fffffc0008c36ce8 fffffc0008c24b28
[ 33.677361] 3c80: 0000000000000001 0000000000000000 0000000000000002 0000000000000004
[ 33.685191] 3ca0: fffffc0009012c28 fffffc0008c79c80 0000000000000001 00000000ad583ee9
[ 33.693017] 3cc0: fffffc0008851320 fffffc0008c50000 00003d09334ec400 0000000000000000
[ 33.700845] 3ce0: 00000000003d0900 003d093352dc0000
[ 33.705726] [<fffffc0008467490>] cursor_timer_handler+0x5c/0x6c
[ 33.711648] [<fffffc0008117bd8>] call_timer_fn+0xc0/0x3e4
[ 33.717047] [<fffffc0008118114>] run_timer_softirq+0x1ec/0x39c
[ 33.722879] [<fffffc0008081794>] __do_softirq+0x104/0x598
[ 33.728278] [<fffffc00080a2ca4>] irq_exit+0x9c/0xc0
[ 33.733155] [<fffffc0008102384>] __handle_domain_irq+0x98/0xfc
[ 33.739009] [<fffffc0008081594>] gic_handle_irq+0x94/0x190
[ 33.744492] Exception stack(0xffffff0ff9dc3fc0 to 0xffffff0ff9dc40e0)
[ 33.750930] 3fc0: fffffc0008c53da0 fffffc0008d6a000 fffffc0008c53ec0 fffffc00080860c0
[ 33.758756] 3fe0: 0000000060000145 fffffc0008a33c98 ffffff0ff9dc0020 0000000000000000
[ 33.766582] 4000: fffffc0008c53ec0 fffffc0008c53da0 0000000000000000 0000000000000000
[ 33.774408] 4020: fffffe0fee5412d0 0000000000000000 0000000000000000 0000000000000000
[ 33.782234] 4040: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 33.790060] 4060: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 33.797887] 4080: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 33.805715] 40a0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 33.813547] 40c0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[ 33.821374] [<fffffc0008084de4>] el1_irq+0xa4/0x114
[ 33.826251] [<fffffc00080860c0>] arch_cpu_idle+0x40/0x1fc
[ 33.831651] [<fffffc00080e9970>] default_idle_call+0x20/0x34
[ 33.837310] [<fffffc00080e9ae4>] cpu_startup_entry+0x160/0x314
[ 33.843142] [<fffffc000880d968>] rest_init+0x150/0x160
[ 33.848283] [<fffffc0008b70b90>] start_kernel+0x3b8/0x3cc
[ 33.853681] [<fffffc0008b701c4>] __primary_switched+0x30/0x6c





On 05/17/2016 07:29 AM, Peter Hurley wrote:
[ +to Scot Doyle ]

Scot, please take a look at this soft lockup.

Regards,
Peter Hurley


Hi Ming,

On 05/17/2016 02:12 AM, Ming Lei wrote:
Hi,

On Tue, May 17, 2016 at 4:07 AM, Dann Frazier
<dann.frazier@xxxxxxxxxxxxx> wrote:
Hi,
I'm observing a soft lockup issue w/ the ASPEED controller on an
arm64 server platform. This was originally seen on Ubuntu's 4.4
kernel, but it is reproducible w/ vanilla 4.6-rc7 as well.

[ 32.792656] NMI watchdog: BUG: soft lockup - CPU#38 stuck for 22s!
[swapper/38:0]

I observe this just once each time I boot into debian-installer (I'm
using a serial console, but the ast module gets loaded during
startup).

I have figured out that it is caused by 'mod_timer(timer, jiffies)' and
'ops->cur_blink_jiffies' is observed as zero in cursor_timer_handler()
when the issue happened.

Thanks for tracking this down.

This softlockup looks to be caused by:

commit 27a4c827c34ac4256a190cc9d24607f953c1c459
Author: Scot Doyle <lkml14@xxxxxxxxxxxxx>
Date: Thu Mar 26 13:56:38 2015 +0000

fbcon: use the cursor blink interval provided by vt

vt now provides a cursor blink interval via vc_data. Use this
interval instead of the currently hardcoded 200 msecs. Store it in
fbcon_ops to avoid locking the console in cursor_timer_handler().

Signed-off-by: Scot Doyle <lkml14@xxxxxxxxxxxxx>
Acked-by: Pavel Machek <pavel@xxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>

and

commit bd63364caa8df38bad2b25b11b2a1b849475cce5
Author: Scot Doyle <lkml14@xxxxxxxxxxxxx>
Date: Thu Mar 26 13:54:39 2015 +0000

vt: add cursor blink interval escape sequence

Add an escape sequence to specify the current console's cursor blink
interval. The interval is specified as a number of milliseconds until
the next cursor display state toggle, from 50 to 65535. /proc/loadavg
did not show a difference with a one msec interval, but the lower
bound is set to 50 msecs since slower hardware wasn't tested.

Store the interval in the vc_data structure for later access by fbcon,
initializing the value to fbcon's current hardcoded value of 200 msecs.

Signed-off-by: Scot Doyle <lkml14@xxxxxxxxxxxxx>
Acked-by: Pavel Machek <pavel@xxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>



Looks it is a real fbcon/vt issue, see following:

fbcon_init()
<-.con_init
<-visual_init()

reset_terminal()
<-vc_init()

vc->vc_cur_blink_ms is just set in reset_terminal() from vc_init() path,
and ops->cur_blink_jiffies is figured out from vc->vc_cur_blink_ms
in fbcon_init().

And visual_init() is always run before vc_init(), so ops->cur_blink_jiffies
is initialized as zero and cause the soft lockup issue finally.

Thanks,
Ming


perf shows that the CPU caught by the NMI is typically in code
updating the cursor timer:

- 16.92% swapper [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore
- _raw_spin_unlock_irqrestore
+ 16.87% mod_timer
+ 0.05% cursor_timer_handler
- 12.15% swapper [kernel.kallsyms] [k] queue_work_on
- queue_work_on
+ 12.00% cursor_timer_handler
+ 0.15% call_timer_fn
+ 10.98% swapper [kernel.kallsyms] [k] run_timer_softirq
- 2.23% swapper [kernel.kallsyms] [k] mod_timer
- mod_timer
+ 1.97% cursor_timer_handler
+ 0.26% call_timer_fn

During the same period, I can see that another CPU is actively
executing the timer function:

- 42.18% kworker/u96:2 [kernel.kallsyms] [k] ww_mutex_unlock
- ww_mutex_unlock
- 40.70% ast_dirty_update
ast_imageblit
soft_cursor
bit_cursor
fb_flashcursor
process_one_work
worker_thread
kthread
ret_from_fork
+ 1.48% ast_imageblit
- 40.15% kworker/u96:2 [kernel.kallsyms] [k] __memcpy_toio
- __memcpy_toio
+ 31.54% ast_dirty_update
+ 8.61% ast_imageblit

Using the graph function tracer on fb_flashcursor(), I see that
ast_dirty_update usually takes around 60 us, in which it makes 16
calls to __memcpy_toio(). However, there is always one instance on
every boot of the installer where ast_dirty_update() takes ~98 *ms* to
complete, during which it makes 743 calls to __memcpy_toio(). While
that doesn't directly account for the full 22s, I do wonder if that
maybe a smoking gun.

fyi, this is being tracked at: https://bugs.launchpad.net/bugs/1574814

-dann