Re: [PATCH v3] mm/slab: Annotate kmem_cache_node->list_lock as raw
From: Hyeonggon Yoo
Date: Sat Oct 22 2022 - 01:09:28 EST
On Fri, Oct 21, 2022 at 09:18:12PM +0200, Jiri Kosina wrote:
> From: Jiri Kosina <jkosina@xxxxxxx>
>
> The list_lock can be taken in hardirq context when do_drain() is being
> called via IPI on all cores, and therefore lockdep complains about it,
> because it can't be preempted on PREEMPT_RT.
>
> That's not a real issue, as SLAB can't be built on PREEMPT_RT anyway, but
> we still want to get rid of the warning on non-PREEMPT_RT builds.
>
> Annotate it therefore as a raw lock in order to get rid of the lockdep
> warning below.
>
> =============================
> [ BUG: Invalid wait context ]
> 6.1.0-rc1-00134-ge35184f32151 #4 Not tainted
> -----------------------------
> swapper/3/0 is trying to lock:
> ffff8bc88086dc18 (&parent->list_lock){..-.}-{3:3}, at: do_drain+0x57/0xb0
> other info that might help us debug this:
> context-{2:2}
> no locks held by swapper/3/0.
> stack backtrace:
> CPU: 3 PID: 0 Comm: swapper/3 Not tainted 6.1.0-rc1-00134-ge35184f32151 #4
> Hardware name: LENOVO 20K5S22R00/20K5S22R00, BIOS R0IET38W (1.16 ) 05/31/2017
> Call Trace:
> <IRQ>
> dump_stack_lvl+0x6b/0x9d
> __lock_acquire+0x1519/0x1730
> ? build_sched_domains+0x4bd/0x1590
> ? __lock_acquire+0xad2/0x1730
> lock_acquire+0x294/0x340
> ? do_drain+0x57/0xb0
> ? sched_clock_tick+0x41/0x60
> _raw_spin_lock+0x2c/0x40
> ? do_drain+0x57/0xb0
> do_drain+0x57/0xb0
> __flush_smp_call_function_queue+0x138/0x220
> __sysvec_call_function+0x4f/0x210
> sysvec_call_function+0x4b/0x90
> </IRQ>
> <TASK>
> asm_sysvec_call_function+0x16/0x20
> RIP: 0010:mwait_idle+0x5e/0x80
> Code: 31 d2 65 48 8b 04 25 80 ed 01 00 48 89 d1 0f 01 c8 48 8b 00 a8 08 75 14 66 90 0f 00 2d 0b 78 46 00 31 c0 48 89 c1 fb 0f 01 c9 <eb> 06 fb 0f 1f 44 00 00 65 48 8b 04 25 80 ed 01 00 f0 80 60 02 df
> RSP: 0000:ffffa90940217ee0 EFLAGS: 00000246
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff9bb9f93a
> RBP: 0000000000000003 R08: 0000000000000001 R09: 0000000000000001
> R10: ffffa90940217ea8 R11: 0000000000000000 R12: ffffffffffffffff
> R13: 0000000000000000 R14: ffff8bc88127c500 R15: 0000000000000000
> ? default_idle_call+0x1a/0xa0
> default_idle_call+0x4b/0xa0
> do_idle+0x1f1/0x2c0
> ? _raw_spin_unlock_irqrestore+0x56/0x70
> cpu_startup_entry+0x19/0x20
> start_secondary+0x122/0x150
> secondary_startup_64_no_verify+0xce/0xdb
> </TASK>
>
Looks good to me.
Reviewed-by: Hyeonggon Yoo <42.hyeyoo@xxxxxxxxx>
> Signed-off-by: Jiri Kosina <jkosina@xxxxxxx>
> ---
>
> v1->v2: fix !SLAB build failures due to list_lock mismatch
> v2->v3: really fix it by sending refreshed version of the patch (facepalm)
>
> mm/slab.c | 90 +++++++++++++++++++++++++++----------------------------
> mm/slab.h | 4 +++
> 2 files changed, 49 insertions(+), 45 deletions(-)
>
> diff --git a/mm/slab.c b/mm/slab.c
> index 59c8e28f7b6a..d8a287900193 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -234,7 +234,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
> parent->shared = NULL;
> parent->alien = NULL;
> parent->colour_next = 0;
> - spin_lock_init(&parent->list_lock);
> + raw_spin_lock_init(&parent->list_lock);
> parent->free_objects = 0;
> parent->free_touched = 0;
> }
> @@ -559,9 +559,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
> slab_node = slab_nid(slab);
> n = get_node(cachep, slab_node);
>
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> free_block(cachep, &objp, 1, slab_node, &list);
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
>
> slabs_destroy(cachep, &list);
> }
> @@ -684,7 +684,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
> struct kmem_cache_node *n = get_node(cachep, node);
>
> if (ac->avail) {
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> /*
> * Stuff objects into the remote nodes shared array first.
> * That way we could avoid the overhead of putting the objects
> @@ -695,7 +695,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
>
> free_block(cachep, ac->entry, ac->avail, node, list);
> ac->avail = 0;
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> }
> }
>
> @@ -768,9 +768,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
> slabs_destroy(cachep, &list);
> } else {
> n = get_node(cachep, slab_node);
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> free_block(cachep, &objp, 1, slab_node, &list);
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> slabs_destroy(cachep, &list);
> }
> return 1;
> @@ -811,10 +811,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
> */
> n = get_node(cachep, node);
> if (n) {
> - spin_lock_irq(&n->list_lock);
> + raw_spin_lock_irq(&n->list_lock);
> n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
> cachep->num;
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
>
> return 0;
> }
> @@ -893,7 +893,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
> goto fail;
>
> n = get_node(cachep, node);
> - spin_lock_irq(&n->list_lock);
> + raw_spin_lock_irq(&n->list_lock);
> if (n->shared && force_change) {
> free_block(cachep, n->shared->entry,
> n->shared->avail, node, &list);
> @@ -911,7 +911,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
> new_alien = NULL;
> }
>
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
> slabs_destroy(cachep, &list);
>
> /*
> @@ -950,7 +950,7 @@ static void cpuup_canceled(long cpu)
> if (!n)
> continue;
>
> - spin_lock_irq(&n->list_lock);
> + raw_spin_lock_irq(&n->list_lock);
>
> /* Free limit for this kmem_cache_node */
> n->free_limit -= cachep->batchcount;
> @@ -961,7 +961,7 @@ static void cpuup_canceled(long cpu)
> nc->avail = 0;
>
> if (!cpumask_empty(mask)) {
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
> goto free_slab;
> }
>
> @@ -975,7 +975,7 @@ static void cpuup_canceled(long cpu)
> alien = n->alien;
> n->alien = NULL;
>
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
>
> kfree(shared);
> if (alien) {
> @@ -1159,7 +1159,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *
> /*
> * Do not assume that spinlocks can be initialized via memcpy:
> */
> - spin_lock_init(&ptr->list_lock);
> + raw_spin_lock_init(&ptr->list_lock);
>
> MAKE_ALL_LISTS(cachep, ptr, nodeid);
> cachep->node[nodeid] = ptr;
> @@ -1330,11 +1330,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
> for_each_kmem_cache_node(cachep, node, n) {
> unsigned long total_slabs, free_slabs, free_objs;
>
> - spin_lock_irqsave(&n->list_lock, flags);
> + raw_spin_lock_irqsave(&n->list_lock, flags);
> total_slabs = n->total_slabs;
> free_slabs = n->free_slabs;
> free_objs = n->free_objects;
> - spin_unlock_irqrestore(&n->list_lock, flags);
> + raw_spin_unlock_irqrestore(&n->list_lock, flags);
>
> pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
> node, total_slabs - free_slabs, total_slabs,
> @@ -2096,7 +2096,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
> {
> #ifdef CONFIG_SMP
> check_irq_off();
> - assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
> + assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
> #endif
> }
>
> @@ -2104,7 +2104,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
> {
> #ifdef CONFIG_SMP
> check_irq_off();
> - assert_spin_locked(&get_node(cachep, node)->list_lock);
> + assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
> #endif
> }
>
> @@ -2144,9 +2144,9 @@ static void do_drain(void *arg)
> check_irq_off();
> ac = cpu_cache_get(cachep);
> n = get_node(cachep, node);
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> free_block(cachep, ac->entry, ac->avail, node, &list);
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> ac->avail = 0;
> slabs_destroy(cachep, &list);
> }
> @@ -2164,9 +2164,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
> drain_alien_cache(cachep, n->alien);
>
> for_each_kmem_cache_node(cachep, node, n) {
> - spin_lock_irq(&n->list_lock);
> + raw_spin_lock_irq(&n->list_lock);
> drain_array_locked(cachep, n->shared, node, true, &list);
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
>
> slabs_destroy(cachep, &list);
> }
> @@ -2188,10 +2188,10 @@ static int drain_freelist(struct kmem_cache *cache,
> nr_freed = 0;
> while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
>
> - spin_lock_irq(&n->list_lock);
> + raw_spin_lock_irq(&n->list_lock);
> p = n->slabs_free.prev;
> if (p == &n->slabs_free) {
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
> goto out;
> }
>
> @@ -2204,7 +2204,7 @@ static int drain_freelist(struct kmem_cache *cache,
> * to the cache.
> */
> n->free_objects -= cache->num;
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
> slab_destroy(cache, slab);
> nr_freed++;
> }
> @@ -2629,7 +2629,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
> INIT_LIST_HEAD(&slab->slab_list);
> n = get_node(cachep, slab_nid(slab));
>
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> n->total_slabs++;
> if (!slab->active) {
> list_add_tail(&slab->slab_list, &n->slabs_free);
> @@ -2639,7 +2639,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
>
> STATS_INC_GROWN(cachep);
> n->free_objects += cachep->num - slab->active;
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
>
> fixup_objfreelist_debug(cachep, &list);
> }
> @@ -2805,7 +2805,7 @@ static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
> {
> struct slab *slab;
>
> - assert_spin_locked(&n->list_lock);
> + assert_raw_spin_locked(&n->list_lock);
> slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
> slab_list);
> if (!slab) {
> @@ -2832,10 +2832,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
> if (!gfp_pfmemalloc_allowed(flags))
> return NULL;
>
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> slab = get_first_slab(n, true);
> if (!slab) {
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> return NULL;
> }
>
> @@ -2844,7 +2844,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
>
> fixup_slab_list(cachep, n, slab, &list);
>
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> fixup_objfreelist_debug(cachep, &list);
>
> return obj;
> @@ -2903,7 +2903,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
> if (!n->free_objects && (!shared || !shared->avail))
> goto direct_grow;
>
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> shared = READ_ONCE(n->shared);
>
> /* See if we can refill from the shared array */
> @@ -2927,7 +2927,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
> must_grow:
> n->free_objects -= ac->avail;
> alloc_done:
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> fixup_objfreelist_debug(cachep, &list);
>
> direct_grow:
> @@ -3147,7 +3147,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
> BUG_ON(!n);
>
> check_irq_off();
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> slab = get_first_slab(n, false);
> if (!slab)
> goto must_grow;
> @@ -3165,12 +3165,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
>
> fixup_slab_list(cachep, n, slab, &list);
>
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> fixup_objfreelist_debug(cachep, &list);
> return obj;
>
> must_grow:
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
> if (slab) {
> /* This slab isn't counted yet so don't update free_objects */
> @@ -3325,7 +3325,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
>
> check_irq_off();
> n = get_node(cachep, node);
> - spin_lock(&n->list_lock);
> + raw_spin_lock(&n->list_lock);
> if (n->shared) {
> struct array_cache *shared_array = n->shared;
> int max = shared_array->limit - shared_array->avail;
> @@ -3354,7 +3354,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
> STATS_SET_FREEABLE(cachep, i);
> }
> #endif
> - spin_unlock(&n->list_lock);
> + raw_spin_unlock(&n->list_lock);
> ac->avail -= batchcount;
> memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
> slabs_destroy(cachep, &list);
> @@ -3721,9 +3721,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
>
> node = cpu_to_mem(cpu);
> n = get_node(cachep, node);
> - spin_lock_irq(&n->list_lock);
> + raw_spin_lock_irq(&n->list_lock);
> free_block(cachep, ac->entry, ac->avail, node, &list);
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
> slabs_destroy(cachep, &list);
> }
> free_percpu(prev);
> @@ -3815,9 +3815,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
> return;
> }
>
> - spin_lock_irq(&n->list_lock);
> + raw_spin_lock_irq(&n->list_lock);
> drain_array_locked(cachep, ac, node, false, &list);
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
>
> slabs_destroy(cachep, &list);
> }
> @@ -3901,7 +3901,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
>
> for_each_kmem_cache_node(cachep, node, n) {
> check_irq_on();
> - spin_lock_irq(&n->list_lock);
> + raw_spin_lock_irq(&n->list_lock);
>
> total_slabs += n->total_slabs;
> free_slabs += n->free_slabs;
> @@ -3910,7 +3910,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
> if (n->shared)
> shared_avail += n->shared->avail;
>
> - spin_unlock_irq(&n->list_lock);
> + raw_spin_unlock_irq(&n->list_lock);
> }
> num_objs = total_slabs * cachep->num;
> active_slabs = total_slabs - free_slabs;
> diff --git a/mm/slab.h b/mm/slab.h
> index 0202a8c2f0d2..7a705e4228c8 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -750,7 +750,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
> * The slab lists for all objects.
> */
> struct kmem_cache_node {
> +#ifdef CONFIG_SLAB
> + raw_spinlock_t list_lock;
> +#else
> spinlock_t list_lock;
> +#endif
>
> #ifdef CONFIG_SLAB
> struct list_head slabs_partial; /* partial list first, better asm code */
> --
> 2.35.3
>
--
Thanks,
Hyeonggon