Re: [PATCH v4 29/35] mm: slub: Move flush_cpu_slab() and __free_slab() invocations out of IRQ context

From: Mike Galbraith
Date: Mon Aug 09 2021 - 14:44:54 EST


On Mon, 2021-08-09 at 09:41 -0400, Qian Cai wrote:
>
>
> On 8/5/2021 11:19 AM, Vlastimil Babka wrote:
> >
> >  
> > +static DEFINE_MUTEX(flush_lock);
> > +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
> > +
> >  static void flush_all(struct kmem_cache *s)
> >  {
> > -       on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
> > +       struct slub_flush_work *sfw;
> > +       unsigned int cpu;
> > +
> > +       mutex_lock(&flush_lock);
>
> Vlastimil, taking the lock here could trigger a warning during memory
> offline/online due to the locking order:
>
> slab_mutex -> flush_lock

Bugger. That chain ending with cpu_hotplug_lock makes slub_cpu_dead()
taking slab_mutex a non-starter for cpu hotplug as well. It's
established early by kernel_init_freeable()..kmem_cache_destroy() as
well as by slab_mem_going_offline_callback().

> [   91.374541] WARNING: possible circular locking dependency detected
> [   91.381411] 5.14.0-rc5-next-20210809+ #84 Not tainted
> [   91.387149] ------------------------------------------------------
> [   91.394016] lsbug/1523 is trying to acquire lock:
> [   91.399406] ffff800018e76530 (flush_lock){+.+.}-{3:3}, at: flush_all+0x50/0x1c8
> [   91.407425]
>                but task is already holding lock:
> [   91.414638] ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> [   91.423603]
>                which lock already depends on the new lock.
>
> [   91.433854]
>                the existing dependency chain (in reverse order) is:
> [   91.442715]
>                -> #4 (slab_mutex){+.+.}-{3:3}:
> [   91.449766]        __lock_acquire+0xb0c/0x1aa8
> [   91.454901]        lock_acquire+0x34c/0xb20
> [   91.459773]        __mutex_lock+0x194/0x1470
> [   91.464732]        mutex_lock_nested+0x6c/0xc0
> [   91.469864]        slab_memory_callback+0x44/0x280
> [   91.475344]        blocking_notifier_call_chain+0xd0/0x138
> [   91.481519]        memory_notify+0x28/0x38
> [   91.486304]        offline_pages+0x2cc/0xce4
> [   91.491262]        memory_subsys_offline+0xd8/0x280
> [   91.496827]        device_offline+0x154/0x1e0
> [   91.501872]        online_store+0xa4/0x118
> [   91.506656]        dev_attr_store+0x44/0x78
> [   91.511527]        sysfs_kf_write+0xe8/0x138
> [   91.516485]        kernfs_fop_write_iter+0x26c/0x3d0
> [   91.522138]        new_sync_write+0x2bc/0x4f8
> [   91.527185]        vfs_write+0x718/0xc88
> [   91.531795]        ksys_write+0xf8/0x1e0
> [   91.536404]        __arm64_sys_write+0x74/0xa8
> [   91.541535]        invoke_syscall.constprop.0+0xdc/0x1d8
> [   91.547536]        do_el0_svc+0xe4/0x2a8
> [   91.552146]        el0_svc+0x64/0x130
> [   91.556498]        el0t_64_sync_handler+0xb0/0xb8
> [   91.561889]        el0t_64_sync+0x180/0x184
> [   91.566760]
>                -> #3 ((memory_chain).rwsem){++++}-{3:3}:
> [   91.574680]        __lock_acquire+0xb0c/0x1aa8
> [   91.579814]        lock_acquire+0x34c/0xb20
> [   91.584685]        down_read+0xf0/0x488
> [   91.589210]        blocking_notifier_call_chain+0x58/0x138
> [   91.595383]        memory_notify+0x28/0x38
> [   91.600167]        offline_pages+0x2cc/0xce4
> [   91.605124]        memory_subsys_offline+0xd8/0x280
> [   91.610689]        device_offline+0x154/0x1e0
> [   91.615734]        online_store+0xa4/0x118
> [   91.620518]        dev_attr_store+0x44/0x78
> [   91.625388]        sysfs_kf_write+0xe8/0x138
> [   91.630346]        kernfs_fop_write_iter+0x26c/0x3d0
> [   91.635997]        new_sync_write+0x2bc/0x4f8
> [   91.641043]        vfs_write+0x718/0xc88
> [   91.645652]        ksys_write+0xf8/0x1e0
> [   91.650262]        __arm64_sys_write+0x74/0xa8
> [   91.655393]        invoke_syscall.constprop.0+0xdc/0x1d8
> [   91.661394]        do_el0_svc+0xe4/0x2a8
> [   91.666004]        el0_svc+0x64/0x130
> [   91.670355]        el0t_64_sync_handler+0xb0/0xb8
> [   91.675747]        el0t_64_sync+0x180/0x184
> [   91.680617]
>                -> #2 (pcp_batch_high_lock){+.+.}-{3:3}:
> [   91.688449]        __lock_acquire+0xb0c/0x1aa8
> [   91.693582]        lock_acquire+0x34c/0xb20
> [   91.698452]        __mutex_lock+0x194/0x1470
> [   91.703410]        mutex_lock_nested+0x6c/0xc0
> [   91.708541]        zone_pcp_update+0x3c/0x68
> [   91.713500]        page_alloc_cpu_online+0x64/0x90
> [   91.718978]        cpuhp_invoke_callback+0x588/0x2ba8
> [   91.724718]        cpuhp_invoke_callback_range+0xa4/0x108
> [   91.730804]        cpu_up+0x598/0xb78
> [   91.735154]        bringup_nonboot_cpus+0x110/0x168
> [   91.740719]        smp_init+0x4c/0xe0
> [   91.745070]        kernel_init_freeable+0x554/0x7c8
> [   91.750637]        kernel_init+0x2c/0x140
> [   91.755334]        ret_from_fork+0x10/0x20
> [   91.760118]
>                -> #1 (cpu_hotplug_lock){++++}-{0:0}:
> [   91.767688]        __lock_acquire+0xb0c/0x1aa8
> [   91.772820]        lock_acquire+0x34c/0xb20
> [   91.777691]        cpus_read_lock+0x98/0x308
> [   91.782649]        flush_all+0x54/0x1c8
> [   91.787173]        __kmem_cache_shrink+0x38/0x2f0
> [   91.792566]        kmem_cache_shrink+0x28/0x38
> [   91.797699]        acpi_os_purge_cache+0x18/0x28
> [   91.803006]        acpi_purge_cached_objects+0x44/0xdc
> [   91.808832]        acpi_initialize_objects+0x24/0x88
> [   91.814487]        acpi_bus_init+0xe0/0x47c
> [   91.819357]        acpi_init+0x130/0x27c
> [   91.823967]        do_one_initcall+0x180/0xbe8
> [   91.829098]        kernel_init_freeable+0x710/0x7c8
> [   91.834663]        kernel_init+0x2c/0x140
> [   91.839360]        ret_from_fork+0x10/0x20
> [   91.844143]
>                -> #0 (flush_lock){+.+.}-{3:3}:
> [   91.851193]        check_prev_add+0x194/0x1170
> [   91.856326]        validate_chain+0xfe8/0x1c20
> [   91.861458]        __lock_acquire+0xb0c/0x1aa8
> [   91.866589]        lock_acquire+0x34c/0xb20
> [   91.871460]        __mutex_lock+0x194/0x1470
> [   91.876418]        mutex_lock_nested+0x6c/0xc0
> [   91.881549]        flush_all+0x50/0x1c8
> [   91.886072]        __kmem_cache_shrink+0x38/0x2f0
> [   91.891465]        slab_memory_callback+0x68/0x280
> [   91.896943]        blocking_notifier_call_chain+0xd0/0x138
> [   91.903117]        memory_notify+0x28/0x38
> [   91.907901]        offline_pages+0x2cc/0xce4
> [   91.912859]        memory_subsys_offline+0xd8/0x280
> [   91.918424]        device_offline+0x154/0x1e0
> [   91.923470]        online_store+0xa4/0x118
> [   91.928254]        dev_attr_store+0x44/0x78
> [   91.933125]        sysfs_kf_write+0xe8/0x138
> [   91.938083]        kernfs_fop_write_iter+0x26c/0x3d0
> [   91.943735]        new_sync_write+0x2bc/0x4f8
> [   91.948781]        vfs_write+0x718/0xc88
> [   91.953391]        ksys_write+0xf8/0x1e0
> [   91.958000]        __arm64_sys_write+0x74/0xa8
> [   91.963130]        invoke_syscall.constprop.0+0xdc/0x1d8
> [   91.969131]        do_el0_svc+0xe4/0x2a8
> [   91.973741]        el0_svc+0x64/0x130
> [   91.978093]        el0t_64_sync_handler+0xb0/0xb8
> [   91.983484]        el0t_64_sync+0x180/0x184
> [   91.988354]
>                other info that might help us debug this:
>
> [   91.998431] Chain exists of:
>                  flush_lock --> (memory_chain).rwsem --> slab_mutex
>
> [   92.010867]  Possible unsafe locking scenario:
>
> [   92.018166]        CPU0                    CPU1
> [   92.023380]        ----                    ----
> [   92.028595]   lock(slab_mutex);
> [   92.032425]                                lock((memory_chain).rwsem);
> [   92.039641]                                lock(slab_mutex);
> [   92.045989]   lock(flush_lock);
> [   92.049819]
>                 *** DEADLOCK ***
>
> [   92.057811] 10 locks held by lsbug/1523:
> [   92.062420]  #0: ffff0000505a8430 (sb_writers#6){.+.+}-{0:0}, at: ksys_write+0xf8/0x1e0
> [   92.071128]  #1: ffff000870f99e88 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x1dc/0x3d0
> [   92.080701]  #2: ffff0000145b2ab8 (kn->active#175){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x1f8/0x3d0
> [   92.090623]  #3: ffff800018f84f08 (device_hotplug_lock){+.+.}-{3:3}, at: lock_device_hotplug_sysfs+0x24/0x88
> [   92.101151]  #4: ffff0000145e9190 (&dev->mutex){....}-{3:3}, at: device_offline+0xa0/0x1e0
> [   92.110115]  #5: ffff800011d26450 (cpu_hotplug_lock){++++}-{0:0}, at: offline_pages+0x10c/0xce4
> [   92.119514]  #6: ffff800018e60570 (mem_hotplug_lock){++++}-{0:0}, at: offline_pages+0x11c/0xce4
> [   92.128919]  #7: ffff800018e5bb68 (pcp_batch_high_lock){+.+.}-{3:3}, at: zone_pcp_disable+0x30/0x60
> [   92.138668]  #8: ffff800018fa0610 ((memory_chain).rwsem){++++}-{3:3}, at: blocking_notifier_call_chain+0x58/0x138
> [   92.149633]  #9: ffff800018e48468 (slab_mutex){+.+.}-{3:3}, at: slab_memory_callback+0x44/0x280
> [   92.159033]
>                stack backtrace:
> [   92.164772] CPU: 29 PID: 1523 Comm: lsbug Not tainted 5.14.0-rc5-next-20210809+ #84
> [   92.173116] Hardware name: MiTAC RAPTOR EV-883832-X3-0001/RAPTOR, BIOS 1.6 06/28/2020
> [   92.181631] Call trace:
> [   92.184763]  dump_backtrace+0x0/0x3b8
> [   92.189115]  show_stack+0x20/0x30
> [   92.193118]  dump_stack_lvl+0x8c/0xb8
> [   92.197469]  dump_stack+0x1c/0x38
> [   92.201472]  print_circular_bug.isra.0+0x530/0x540
> [   92.206953]  check_noncircular+0x27c/0x2f0
> [   92.211738]  check_prev_add+0x194/0x1170
> [   92.216349]  validate_chain+0xfe8/0x1c20
> [   92.220961]  __lock_acquire+0xb0c/0x1aa8
> [   92.225571]  lock_acquire+0x34c/0xb20
> [   92.229921]  __mutex_lock+0x194/0x1470
> [   92.234358]  mutex_lock_nested+0x6c/0xc0
> [   92.238968]  flush_all+0x50/0x1c8
> flush_all at /usr/src/linux-next/mm/slub.c:2649
> [   92.242971]  __kmem_cache_shrink+0x38/0x2f0
> [   92.247842]  slab_memory_callback+0x68/0x280
> slab_mem_going_offline_callback at /usr/src/linux-next/mm/slub.c:4586
> (inlined by) slab_memory_callback at /usr/src/linux-next/mm/slub.c:4678
> [   92.252800]  blocking_notifier_call_chain+0xd0/0x138
> notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:83
> (inlined by) blocking_notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:337
> (inlined by) blocking_notifier_call_chain at /usr/src/linux-next/kernel/notifier.c:325
> [   92.258453]  memory_notify+0x28/0x38
> [   92.262717]  offline_pages+0x2cc/0xce4
> [   92.267153]  memory_subsys_offline+0xd8/0x280
> [   92.272198]  device_offline+0x154/0x1e0
> [   92.276723]  online_store+0xa4/0x118
> [   92.280986]  dev_attr_store+0x44/0x78
> [   92.285336]  sysfs_kf_write+0xe8/0x138
> [   92.289774]  kernfs_fop_write_iter+0x26c/0x3d0
> [   92.294906]  new_sync_write+0x2bc/0x4f8
> [   92.299431]  vfs_write+0x718/0xc88
> [   92.303520]  ksys_write+0xf8/0x1e0
> [   92.307608]  __arm64_sys_write+0x74/0xa8
> [   92.312219]  invoke_syscall.constprop.0+0xdc/0x1d8
> [   92.317698]  do_el0_svc+0xe4/0x2a8
> [   92.321789]  el0_svc+0x64/0x130
> [   92.325619]  el0t_64_sync_handler+0xb0/0xb8
> [   92.330489]  el0t_64_sync+0x180/0x184
>
> > +       cpus_read_lock();
> > +
> > +       for_each_online_cpu(cpu) {
> > +               sfw = &per_cpu(slub_flush, cpu);
> > +               if (!has_cpu_slab(cpu, s)) {
> > +                       sfw->skip = true;
> > +                       continue;
> > +               }
> > +               INIT_WORK(&sfw->work, flush_cpu_slab);
> > +               sfw->skip = false;
> > +               sfw->s = s;
> > +               schedule_work_on(cpu, &sfw->work);
> > +       }
> > +
> > +       for_each_online_cpu(cpu) {
> > +               sfw = &per_cpu(slub_flush, cpu);
> > +               if (sfw->skip)
> > +                       continue;
> > +               flush_work(&sfw->work);
> > +       }
> > +
> > +       cpus_read_unlock();
> > +       mutex_unlock(&flush_lock);
> >  }
> >  
> >  /*
> >