[PATCH] mem_cgroup: event_list_lock requires irqsave lock
From: Debabrata Banerjee
Date: Mon May 13 2019 - 16:47:26 EST
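Lockdep reports a potential deadlock in memcg_event_wake(). That function
runs with the eventfd waitqueue lock (ctx->wqh) held, and ctx->wqh is also
taken from softirq context (eventfd_signal() called from aio_complete()).
memcg->event_list_lock, however, is taken with plain spin_lock(), i.e. with
interrupts enabled, in memcg_write_event_control(). If an interrupt arrives
while event_list_lock is held and tries to take ctx->wqh, it can deadlock
against another CPU that holds ctx->wqh and is waiting for event_list_lock.
Fix this by taking event_list_lock with spin_lock_irqsave() in
memcg_event_wake(), memcg_write_event_control() and mem_cgroup_css_offline().
The lockdep report: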
[ 850.145324] =====================================================
[ 850.151458] WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected
[ 850.158108] 4.19.29-4.19.0-debug-99d9c44b25c08f51 #1 Tainted: G O
[ 850.165540] -----------------------------------------------------
[ 850.171669] gh_PhantomThr00/8426 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
[ 850.178924] 00000000cf6f8a05 (&(&memcg->event_list_lock)->rlock){+.+.}, at: memcg_event_wake+0x58/0x210
[ 850.188360]
[ 850.188360] and this task is already holding:
[ 850.194226] 00000000bc034eb9 (&ctx->wqh#2){..-.}, at: __wake_up_common_lock+0xa3/0x100
[ 850.202183] which would create a new lock dependency:
[ 850.207279] (&ctx->wqh#2){..-.} -> (&(&memcg->event_list_lock)->rlock){+.+.}
[ 850.214454]
[ 850.214454] but this new dependency connects a SOFTIRQ-irq-safe lock:
[ 850.222403] (&ctx->wqh#2){..-.}
[ 850.222405]
[ 850.222405] ... which became SOFTIRQ-irq-safe at:
[ 850.231894] _raw_spin_lock_irqsave+0x48/0x80
[ 850.236385] eventfd_signal+0x1f/0xc0
[ 850.240169] aio_complete+0x51b/0xd40
[ 850.243959] dio_complete+0x2e3/0x880
[ 850.247970] blk_update_request+0x197/0xb50
[ 850.252277] scsi_end_request+0x77/0x870
[ 850.256325] scsi_io_completion+0x211/0x14e0
[ 850.260720] blk_done_softirq+0x212/0x310
[ 850.264863] __do_softirq+0x22e/0x868
[ 850.268657] irq_exit+0x150/0x170
[ 850.272098] do_IRQ+0x87/0x1a0
[ 850.275277] ret_from_intr+0x0/0x22
[ 850.278893] orc_find+0x9a/0x340
[ 850.282246] unwind_next_frame+0x1fd/0x1850
[ 850.286554] __save_stack_trace+0x73/0xd0
[ 850.290690] kasan_kmalloc+0xda/0x170
[ 850.294474] kmem_cache_alloc+0x14e/0x340
[ 850.298609] create_object+0x81/0x8e0
[ 850.302396] kmem_cache_alloc+0x2c8/0x340
[ 850.306529] __blockdev_direct_IO+0x36c/0xae51
[ 850.311121] ext4_direct_IO+0xecd/0x1690 [ext4]
[ 850.315779] generic_file_read_iter+0x1de/0x15b0
[ 850.320516] aio_read+0x2a7/0x360
[ 850.323957] io_submit_one+0x5a6/0x1710
[ 850.327917] __se_sys_io_submit+0x115/0x340
[ 850.332226] do_syscall_64+0x9b/0x400
[ 850.336014] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 850.341187]
[ 850.341187] to a SOFTIRQ-irq-unsafe lock:
[ 850.346714] (&(&memcg->event_list_lock)->rlock){+.+.}
[ 850.346717]
[ 850.346717] ... which became SOFTIRQ-irq-unsafe at:
[ 850.358283] ...
[ 850.358286] _raw_spin_lock+0x30/0x70
[ 850.363867] memcg_write_event_control+0x982/0xe60
[ 850.368782] cgroup_file_write+0x260/0x640
[ 850.373002] kernfs_fop_write+0x278/0x400
[ 850.377136] __vfs_write+0xd5/0x5b0
[ 850.380748] vfs_write+0x15d/0x460
[ 850.384275] ksys_write+0xb1/0x170
[ 850.387803] do_syscall_64+0x9b/0x400
[ 850.391591] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 850.396762]
[ 850.396762] other info that might help us debug this:
[ 850.396762]
[ 850.404798] Possible interrupt unsafe locking scenario:
[ 850.404798]
[ 850.411626]        CPU0                    CPU1
[ 850.416195]        ----                    ----
[ 850.420760]   lock(&(&memcg->event_list_lock)->rlock);
[ 850.426343]                                local_irq_disable();
[ 850.432297]                                lock(&ctx->wqh#2);
[ 850.438087]                                lock(&(&memcg->event_list_lock)->rlock);
[ 850.445781]   <Interrupt>
[ 850.448442]     lock(&ctx->wqh#2);
[ 850.451884]
[ 850.451884] *** DEADLOCK ***
[ 850.451884]
[ 850.457847] 1 lock held by gh_PhantomThr00/8426:
[ 850.462499] #0: 00000000bc034eb9 (&ctx->wqh#2){..-.}, at: __wake_up_common_lock+0xa3/0x100
[ 850.470889]
[ 850.470889] the dependencies between SOFTIRQ-irq-safe lock and the holding lock:
[ 850.479801] -> (&ctx->wqh#2){..-.} ops: 2456971 {
[ 850.484546] IN-SOFTIRQ-W at:
[ 850.487731] _raw_spin_lock_irqsave+0x48/0x80
[ 850.493779] eventfd_signal+0x1f/0xc0
[ 850.499134] aio_complete+0x51b/0xd40
[ 850.504481] dio_complete+0x2e3/0x880
[ 850.509828] blk_update_request+0x197/0xb50
[ 850.515696] scsi_end_request+0x77/0x870
[ 850.521312] scsi_io_completion+0x211/0x14e0
[ 850.527265] blk_done_softirq+0x212/0x310
[ 850.532959] __do_softirq+0x22e/0x868
[ 850.538307] irq_exit+0x150/0x170
[ 850.543307] do_IRQ+0x87/0x1a0
[ 850.548046] ret_from_intr+0x0/0x22
[ 850.553224] orc_find+0x9a/0x340
[ 850.558136] unwind_next_frame+0x1fd/0x1850
[ 850.564002] __save_stack_trace+0x73/0xd0
[ 850.569700] kasan_kmalloc+0xda/0x170
[ 850.575054] kmem_cache_alloc+0x14e/0x340
[ 850.580747] create_object+0x81/0x8e0
[ 850.586103] kmem_cache_alloc+0x2c8/0x340
[ 850.591797] __blockdev_direct_IO+0x36c/0xae51
[ 850.597944] ext4_direct_IO+0xecd/0x1690 [ext4]
[ 850.604162] generic_file_read_iter+0x1de/0x15b0
[ 850.610474] aio_read+0x2a7/0x360
[ 850.615472] io_submit_one+0x5a6/0x1710
[ 850.620992] __se_sys_io_submit+0x115/0x340
[ 850.626862] do_syscall_64+0x9b/0x400
[ 850.632206] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 850.638943] INITIAL USE at:
[ 850.642037] _raw_spin_lock_irqsave+0x48/0x80
[ 850.647989] add_wait_queue+0x49/0x150
[ 850.653338] ep_ptable_queue_proc+0x296/0x380
[ 850.659290] eventfd_poll+0x6f/0x100
[ 850.664464] ep_item_poll.isra.1+0xf9/0x320
[ 850.670246] __se_sys_epoll_ctl+0x12e0/0x3030
[ 850.676200] do_syscall_64+0x9b/0x400
[ 850.681469] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 850.688115] }
[ 850.689825] ... key at: [<ffffffffbf2ac7e0>] __key.41123+0x0/0x40
[ 850.696479] ... acquired at:
[ 850.699494] _raw_spin_lock+0x30/0x70
[ 850.703378] memcg_event_wake+0x58/0x210
[ 850.707512] __wake_up_common+0x183/0x550
[ 850.711733] __wake_up_common_lock+0xbe/0x100
[ 850.716299] eventfd_release+0x47/0x70
[ 850.720261] __fput+0x256/0x7e0
[ 850.723614] task_work_run+0xfe/0x170
[ 850.727489] do_exit+0x993/0x2b60
[ 850.731014] do_group_exit+0xee/0x2b0
[ 850.734889] get_signal+0x31f/0x18c0
[ 850.738674] do_signal+0x9b/0x16d0
[ 850.742289] exit_to_usermode_loop+0x146/0x1a0
[ 850.746942] do_syscall_64+0x317/0x400
[ 850.750903] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 850.756164]
[ 850.757706]
[ 850.757706] the dependencies between the lock to be acquired
[ 850.757707] and SOFTIRQ-irq-unsafe lock:
[ 850.768925] -> (&(&memcg->event_list_lock)->rlock){+.+.} ops: 4 {
[ 850.775058] HARDIRQ-ON-W at:
[ 850.778238] _raw_spin_lock+0x30/0x70
[ 850.783585] memcg_write_event_control+0x982/0xe60
[ 850.790069] cgroup_file_write+0x260/0x640
[ 850.795851] kernfs_fop_write+0x278/0x400
[ 850.801552] __vfs_write+0xd5/0x5b0
[ 850.807417] vfs_write+0x15d/0x460
[ 850.812505] ksys_write+0xb1/0x170
[ 850.817593] do_syscall_64+0x9b/0x400
[ 850.822941] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 850.829675] SOFTIRQ-ON-W at:
[ 850.832856] _raw_spin_lock+0x30/0x70
[ 850.838203] memcg_write_event_control+0x982/0xe60
[ 850.844676] cgroup_file_write+0x260/0x640
[ 850.850457] kernfs_fop_write+0x278/0x400
[ 850.856152] __vfs_write+0xd5/0x5b0
[ 850.861333] vfs_write+0x15d/0x460
[ 850.866420] ksys_write+0xb1/0x170
[ 850.871507] do_syscall_64+0x9b/0x400
[ 850.876856] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 850.883595] INITIAL USE at:
[ 850.886692] _raw_spin_lock+0x30/0x70
[ 850.891953] memcg_write_event_control+0x982/0xe60
[ 850.898347] cgroup_file_write+0x260/0x640
[ 850.904042] kernfs_fop_write+0x278/0x400
[ 850.909649] __vfs_write+0xd5/0x5b0
[ 850.914736] vfs_write+0x15d/0x460
[ 850.919737] ksys_write+0xb1/0x170
[ 850.924738] do_syscall_64+0x9b/0x400
[ 850.929998] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 850.936643] }
[ 850.938356] ... key at: [<ffffffffbf2a6680>] __key.75511+0x0/0x40
[ 850.945007] ... acquired at:
[ 850.948014] _raw_spin_lock+0x30/0x70
[ 850.951891] memcg_event_wake+0x58/0x210
[ 850.956025] __wake_up_common+0x183/0x550
[ 850.960243] __wake_up_common_lock+0xbe/0x100
[ 850.964811] eventfd_release+0x47/0x70
[ 850.968771] __fput+0x256/0x7e0
[ 850.972126] task_work_run+0xfe/0x170
[ 850.975999] do_exit+0x993/0x2b60
[ 850.979528] do_group_exit+0xee/0x2b0
[ 850.983402] get_signal+0x31f/0x18c0
[ 850.987189] do_signal+0x9b/0x16d0
[ 850.990802] exit_to_usermode_loop+0x146/0x1a0
[ 850.995456] do_syscall_64+0x317/0x400
[ 850.999425] entry_SYSCALL_64_after_hwframe+0x49/0xbe
Signed-off-by: Debabrata Banerjee <dbanerje@xxxxxxxxxx>
---
mm/memcontrol.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 81a0d3914ec9..3faaa6934335 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4016,7 +4016,9 @@ static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
*/
- spin_lock(&memcg->event_list_lock);
+ unsigned long irqflags;
+
+ spin_lock_irqsave(&memcg->event_list_lock, irqflags);
if (!list_empty(&event->list)) {
list_del_init(&event->list);
/*
@@ -4025,7 +4027,7 @@ static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
*/
schedule_work(&event->remove);
}
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irqrestore(&memcg->event_list_lock, irqflags);
}
return 0;
@@ -4062,6 +4064,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
const char *name;
char *endp;
int ret;
+ unsigned long flags;
buf = strstrip(buf);
@@ -4157,9 +4160,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
vfs_poll(efile.file, &event->pt);
- spin_lock(&memcg->event_list_lock);
+ spin_lock_irqsave(&memcg->event_list_lock, flags);
list_add(&event->list, &memcg->event_list);
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irqrestore(&memcg->event_list_lock, flags);
fdput(cfile);
fdput(efile);
@@ -4578,18 +4581,19 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_event *event, *tmp;
+ unsigned long flags;
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
- spin_lock(&memcg->event_list_lock);
+ spin_lock_irqsave(&memcg->event_list_lock, flags);
list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
- spin_unlock(&memcg->event_list_lock);
+ spin_unlock_irqrestore(&memcg->event_list_lock, flags);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
--
2.21.0