Re: [PATCH v4 29/35] mm: slub: Move flush_cpu_slab() invocations __free_slab() invocations out of IRQ context

From: Qian Cai
Date: Mon Aug 09 2021 - 09:41:53 EST




On 8/5/2021 11:19 AM, Vlastimil Babka wrote:
> From: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
>
> flush_all() flushes a specific SLAB cache on each CPU (where the cache
> is present). The deactivate_slab()/__free_slab() invocation happens
> within IPI handler and is problematic for PREEMPT_RT.
>
> The flush operation is not a frequent operation or a hot path. The
> per-CPU flush operation can be moved to within a workqueue.
>
> [vbabka@xxxxxxx: adapt to new SLUB changes]
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
> Signed-off-by: Vlastimil Babka <vbabka@xxxxxxx>
> ---
> mm/slub.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 48 insertions(+), 8 deletions(-)
>
> diff --git a/mm/slub.c b/mm/slub.c
> index dceb289cb052..da48ada3d17f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -2513,33 +2513,73 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
> unfreeze_partials_cpu(s, c);
> }
>
> +struct slub_flush_work {
> + struct work_struct work;
> + struct kmem_cache *s;
> + bool skip;
> +};
> +
> /*
> * Flush cpu slab.
> *
> - * Called from IPI handler with interrupts disabled.
> + * Called from CPU work handler with migration disabled.
> */
> -static void flush_cpu_slab(void *d)
> +static void flush_cpu_slab(struct work_struct *w)
> {
> - struct kmem_cache *s = d;
> - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
> + struct kmem_cache *s;
> + struct kmem_cache_cpu *c;
> + struct slub_flush_work *sfw;
> +
> + sfw = container_of(w, struct slub_flush_work, work);
> +
> + s = sfw->s;
> + c = this_cpu_ptr(s->cpu_slab);
>
> if (c->page)
> - flush_slab(s, c, false);
> + flush_slab(s, c, true);
>
> unfreeze_partials(s);
> }
>
> -static bool has_cpu_slab(int cpu, void *info)
> +static bool has_cpu_slab(int cpu, struct kmem_cache *s)
> {
> - struct kmem_cache *s = info;
> struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
>
> return c->page || slub_percpu_partial(c);
> }
>
> +static DEFINE_MUTEX(flush_lock);
> +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> +
> static void flush_all(struct kmem_cache *s)
> {
> - on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> + struct slub_flush_work *sfw;
> + unsigned int cpu;
> +
> + mutex_lock(&flush_lock);

Vlastimil, taking the lock here could trigger a warning during memory offline/online due to the locking order:

slab_mutex -> flush_lock

[ 91.374541] WARNING: possible circular locking dependency detected
[ 91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
[ 91.387149] ------------------------------------------------------
[ 91.394016] lsbug/1523 is trying to acquire lock:
[ 91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
[ 91.407425]
but task is already holding lock:
[ 91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
[ 91.423603]
which lock already depends on the new lock.

[ 91.433854]
the existing dependency chain (in reverse order) is:
[ 91.442715]
-> #4 (slab_mutex){+.+.}-{3:3}:
[ 91.449766] __lock_acquire+0xb0c/0x1aa8
[ 91.454901] lock_acquire+0x34c/0xb20
[ 91.459773] __mutex_lock+0x194/0x1470
[ 91.464732] mutex_lock_nested+0x6c/0xc0
[ 91.469864] slab_memory_callback+0x44/0x280
[ 91.475344] blocking_notifier_call_chain+0xd0/0x138
[ 91.481519] memory_notify+0x28/0x38
[ 91.486304] offline_pages+0x2cc/0xce4
[ 91.491262] memory_subsys_offline+0xd8/0x280
[ 91.496827] device_offline+0x154/0x1e0
[ 91.501872] online_store+0xa4/0x118
[ 91.506656] dev_attr_store+0x44/0x78
[ 91.511527] sysfs_kf_write+0xe8/0x138
[ 91.516485] kernfs_fop_write_iter+0x26c/0x3d0
[ 91.522138] new_sync_write+0x2bc/0x4f8
[ 91.527185] vfs_write+0x718/0xc88
[ 91.531795] ksys_write+0xf8/0x1e0
[ 91.536404] __arm64_sys_write+0x74/0xa8
[ 91.541535] invoke_syscall.constprop.0+0xdc/0x1d8
[ 91.547536] do_el0_svc+0xe4/0x2a8
[ 91.552146] el0_svc+0x64/0x130
[ 91.556498] el0t_64_sync_handler+0xb0/0xb8
[ 91.561889] el0t_64_sync+0x180/0x184
[ 91.566760]
-> #3 ((memory_chain).rwsem){++++}-{3:3}:
[ 91.574680] __lock_acquire+0xb0c/0x1aa8
[ 91.579814] lock_acquire+0x34c/0xb20
[ 91.584685] down_read+0xf0/0x488
[ 91.589210] blocking_notifier_call_chain+0x58/0x138
[ 91.595383] memory_notify+0x28/0x38
[ 91.600167] offline_pages+0x2cc/0xce4
[ 91.605124] memory_subsys_offline+0xd8/0x280
[ 91.610689] device_offline+0x154/0x1e0
[ 91.615734] online_store+0xa4/0x118
[ 91.620518] dev_attr_store+0x44/0x78
[ 91.625388] sysfs_kf_write+0xe8/0x138
[ 91.630346] kernfs_fop_write_iter+0x26c/0x3d0
[ 91.635997] new_sync_write+0x2bc/0x4f8
[ 91.641043] vfs_write+0x718/0xc88
[ 91.645652] ksys_write+0xf8/0x1e0
[ 91.650262] __arm64_sys_write+0x74/0xa8
[ 91.655393] invoke_syscall.constprop.0+0xdc/0x1d8
[ 91.661394] do_el0_svc+0xe4/0x2a8
[ 91.666004] el0_svc+0x64/0x130
[ 91.670355] el0t_64_sync_handler+0xb0/0xb8
[ 91.675747] el0t_64_sync+0x180/0x184
[ 91.680617]
-> #2 (pcp_batch_high_lock){+.+.}-{3:3}:
[ 91.688449] __lock_acquire+0xb0c/0x1aa8
[ 91.693582] lock_acquire+0x34c/0xb20
[ 91.698452] __mutex_lock+0x194/0x1470
[ 91.703410] mutex_lock_nested+0x6c/0xc0
[ 91.708541] zone_pcp_update+0x3c/0x68
[ 91.713500] page_alloc_cpu_online+0x64/0x90
[ 91.718978] cpuhp_invoke_callback+0x588/0x2ba8
[ 91.724718] cpuhp_invoke_callback_range+0xa4/0x108
[ 91.730804] cpu_up+0x598/0xb78
[ 91.735154] bringup_nonboot_cpus+0x110/0x168
[ 91.740719] smp_init+0x4c/0xe0
[ 91.745070] kernel_init_freeable+0x554/0x7c8
[ 91.750637] kernel_init+0x2c/0x140
[ 91.755334] ret_from_fork+0x10/0x20
[ 91.760118]
-> #1 (cpu_hotplug_lock){++++}-{0:0}:
[ 91.767688] __lock_acquire+0xb0c/0x1aa8
[ 91.772820] lock_acquire+0x34c/0xb20
[ 91.777691] cpus_read_lock+0x98/0x308
[ 91.782649] flush_all+0x54/0x1c8
[ 91.787173] __kmem_cache_shrink+0x38/0x2f0
[ 91.792566] kmem_cache_shrink+0x28/0x38
[ 91.797699] acpi_os_purge_cache+0x18/0x28
[ 91.803006] acpi_purge_cached_objects+0x44/0xdc
[ 91.808832] acpi_initialize_objects+0x24/0x88
[ 91.814487] acpi_bus_init+0xe0/0x47c
[ 91.819357] acpi_init+0x130/0x27c
[ 91.823967] do_one_initcall+0x180/0xbe8
[ 91.829098] kernel_init_freeable+0x710/0x7c8
[ 91.834663] kernel_init+0x2c/0x140
[ 91.839360] ret_from_fork+0x10/0x20
[ 91.844143]
-> #0 (flush_lock){+.+.}-{3:3}:
[ 91.851193] check_prev_add+0x194/0x1170
[ 91.856326] validate_chain+0xfe8/0x1c20
[ 91.861458] __lock_acquire+0xb0c/0x1aa8
[ 91.866589] lock_acquire+0x34c/0xb20
[ 91.871460] __mutex_lock+0x194/0x1470
[ 91.876418] mutex_lock_nested+0x6c/0xc0
[ 91.881549] flush_all+0x50/0x1c8
[ 91.886072] __kmem_cache_shrink+0x38/0x2f0
[ 91.891465] slab_memory_callback+0x68/0x280
[ 91.896943] blocking_notifier_call_chain+0xd0/0x138
[ 91.903117] memory_notify+0x28/0x38
[ 91.907901] offline_pages+0x2cc/0xce4
[ 91.912859] memory_subsys_offline+0xd8/0x280
[ 91.918424] device_offline+0x154/0x1e0
[ 91.923470] online_store+0xa4/0x118
[ 91.928254] dev_attr_store+0x44/0x78
[ 91.933125] sysfs_kf_write+0xe8/0x138
[ 91.938083] kernfs_fop_write_iter+0x26c/0x3d0
[ 91.943735] new_sync_write+0x2bc/0x4f8
[ 91.948781] vfs_write+0x718/0xc88
[ 91.953391] ksys_write+0xf8/0x1e0
[ 91.958000] __arm64_sys_write+0x74/0xa8
[ 91.963130] invoke_syscall.constprop.0+0xdc/0x1d8
[ 91.969131] do_el0_svc+0xe4/0x2a8
[ 91.973741] el0_svc+0x64/0x130
[ 91.978093] el0t_64_sync_handler+0xb0/0xb8
[ 91.983484] el0t_64_sync+0x180/0x184
[ 91.988354]
other info that might help us debug this:

[ 91.998431] Chain exists of:
flush_lock --> (memory_chain).rwsem --> slab_mutex

[ 92.010867] Possible unsafe locking scenario:

[ 92.018166] CPU0 CPU1
[ 92.023380] ---- ----
[ 92.028595] lock(slab_mutex);
[ 92.032425] lock((memory_chain).rwsem);
[ 92.039641] lock(slab_mutex);
[ 92.045989] lock(flush_lock);
[ 92.049819]
*** DEADLOCK ***

[ 92.057811] 10 locks held by lsbug/1523:
[ 92.062420] #0: ffff0000505a8430 (sb_writers#6){.+.+}-{0:0}, at: ksys_write+0xf8/0x1e0
[ 92.071128] #1: ffff000870f99e88 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x1dc/0x3d0
[ 92.080701] #2: ffff0000145b2ab8 (kn->active#175){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x1f8/0x3d0
[ 92.090623] #3: ffff800018f84f08 (device_hotplug_lock){+.+.}-{3:3}, at: lock_device_hotplug_sysfs+0x24/0x88
[ 92.101151] #4: ffff0000145e9190 (&dev->mutex){....}-{3:3}, at: device_offline+0xa0/0x1e0
[ 92.110115] #5: ffff800011d26450 (cpu_hotplug_lock){++++}-{0:0}, at: offline_pages+0x10c/0xce4
[ 92.119514] #6: ffff800018e60570 (mem_hotplug_lock){++++}-{0:0}, at: offline_pages+0x11c/0xce4
[ 92.128919] #7: ffff800018e5bb68 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x30/0x60
[ 92.138668] #8: ffff800018fa0610 ((memory_chain).rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x58/0x138
[ 92.149633] #9: ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
[ 92.159033]
stack backtrace:
[ 92.164772] CPU: 29 PID: 1523 Comm: lsbug Not tainted 5.14.0-rc5-next-20210809+ #84
[ 92.173116] Hardware name: MiTAC RAPTOR EV-883832-X3-0001/RAPTOR, BIOS 1.6 06/28/2020
[ 92.181631] Call trace:
[ 92.184763] dump_backtrace+0x0/0x3b8
[ 92.189115] show_stack+0x20/0x30
[ 92.193118] dump_stack_lvl+0x8c/0xb8
[ 92.197469] dump_stack+0x1c/0x38
[ 92.201472] print_circular_bug.isra.0+0x530/0x540
[ 92.206953] check_noncircular+0x27c/0x2f0
[ 92.211738] check_prev_add+0x194/0x1170
[ 92.216349] validate_chain+0xfe8/0x1c20
[ 92.220961] __lock_acquire+0xb0c/0x1aa8
[ 92.225571] lock_acquire+0x34c/0xb20
[ 92.229921] __mutex_lock+0x194/0x1470
[ 92.234358] mutex_lock_nested+0x6c/0xc0
[ 92.238968] flush_all+0x50/0x1c8
flush_all at /usr/src/linux-next/mm/slub.c:2649
[ 92.242971] __kmem_cache_shrink+0x38/0x2f0
[ 92.247842] slab_memory_callback+0x68/0x280
slab_mem_going_offline_callback at /usr/src/linux-next/mm/slub.c:4586
(inlined by) slab_memory_callback at /usr/src/linux-next/mm/slub.c:4678
[ 92.252800] blocking_notifier_call_chain+0xd0/0x138
notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:83
(inlined by) blocking_notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:337
(inlined by) blocking_notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:325
[ 92.258453] memory_notify+0x28/0x38
[ 92.262717] offline_pages+0x2cc/0xce4
[ 92.267153] memory_subsys_offline+0xd8/0x280
[ 92.272198] device_offline+0x154/0x1e0
[ 92.276723] online_store+0xa4/0x118
[ 92.280986] dev_attr_store+0x44/0x78
[ 92.285336] sysfs_kf_write+0xe8/0x138
[ 92.289774] kernfs_fop_write_iter+0x26c/0x3d0
[ 92.294906] new_sync_write+0x2bc/0x4f8
[ 92.299431] vfs_write+0x718/0xc88
[ 92.303520] ksys_write+0xf8/0x1e0
[ 92.307608] __arm64_sys_write+0x74/0xa8
[ 92.312219] invoke_syscall.constprop.0+0xdc/0x1d8
[ 92.317698] do_el0_svc+0xe4/0x2a8
[ 92.321789] el0_svc+0x64/0x130
[ 92.325619] el0t_64_sync_handler+0xb0/0xb8
[ 92.330489] el0t_64_sync+0x180/0x184

> + cpus_read_lock();
> +
> + for_each_online_cpu(cpu) {
> + sfw = &per_cpu(slub_flush, cpu);
> + if (!has_cpu_slab(cpu, s)) {
> + sfw->skip = true;
> + continue;
> + }
> + INIT_WORK(&sfw->work, flush_cpu_slab);
> + sfw->skip = false;
> + sfw->s = s;
> + schedule_work_on(cpu, &sfw->work);
> + }
> +
> + for_each_online_cpu(cpu) {
> + sfw = &per_cpu(slub_flush, cpu);
> + if (sfw->skip)
> + continue;
> + flush_work(&sfw->work);
> + }
> +
> + cpus_read_unlock();
> + mutex_unlock(&flush_lock);
> }
>
> /*
>