Re: [PATCH] locking/local_lock: Reduce local_[un]lock_nested_bh() overhead
From: Eric Dumazet
Date: Mon Mar 09 2026 - 10:24:39 EST
On Mon, Mar 9, 2026 at 3:03 PM Eric Dumazet <edumazet@xxxxxxxxxx> wrote:
>
> On Mon, Mar 9, 2026 at 2:44 PM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> >
> > On Mon, Mar 09, 2026 at 12:20:55PM +0000, Eric Dumazet wrote:
> >
> > > diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h
> > > index b8830148a8591c17c22e36470fbc13ff5c354955..40c2da54a0b720265be7b6327e0922a49befd8fc 100644
> > > --- a/include/linux/local_lock.h
> > > +++ b/include/linux/local_lock.h
> > > @@ -94,12 +94,19 @@ DEFINE_LOCK_GUARD_1(local_lock_irqsave, local_lock_t __percpu,
> > > local_unlock_irqrestore(_T->lock, _T->flags),
> > > unsigned long flags)
> > >
> > > +#if defined(WARN_CONTEXT_ANALYSIS) || defined(CONFIG_PREEMPT_RT) || \
> > > + defined(CONFIG_DEBUG_LOCK_ALLOC)
> > > #define local_lock_nested_bh(_lock) \
> > > __local_lock_nested_bh(__this_cpu_local_lock(_lock))
> > >
> > > #define local_unlock_nested_bh(_lock) \
> > > __local_unlock_nested_bh(__this_cpu_local_lock(_lock))
> > >
> > > +#else
> > > +static inline void local_lock_nested_bh(local_lock_t *_lock) {}
> > > +static inline void local_unlock_nested_bh(local_lock_t *__lock) {}
> > > +#endif
> >
> > This isn't going to work; WARN_CONTEXT_ANALYSIS is unconditional on
> > clang >= 22.1
> >
> > How come that this isn't DCEd properly?
>
> It might be partially done.
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 0e217041958a83d2a3c18de2965808442546c49b..50455951dc38668b0cbbcccdb2c5ce726e3c4da9
> 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -7498,3 +7498,12 @@ struct vlan_type_depth
> __vlan_get_protocol_offset(const struct sk_buff *skb,
> };
> }
> EXPORT_SYMBOL(__vlan_get_protocol_offset);
> +
> +void ericeric(void);
> +void ericeric(void)
> +{
> + local_lock_nested_bh(&napi_alloc_cache.bh_lock);
> + local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
> + local_lock_nested_bh(&napi_alloc_cache.bh_lock);
> + local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
> +}
>
> objdump --disassemble=ericeric -r net/core/skbuff.o
>
> net/core/skbuff.o: file format elf64-x86-64
>
>
> Disassembly of section .text:
>
> 000000000000fe40 <ericeric>:
> fe40: f3 0f 1e fa endbr64
> fe44: e8 00 00 00 00 call fe49 <ericeric+0x9>
> fe45: R_X86_64_PLT32 __fentry__-0x4
> fe49: 65 48 8b 05 00 00 00 mov %gs:0x0(%rip),%rax # fe51
> <ericeric+0x11>
> fe50: 00
> fe4d: R_X86_64_PC32 this_cpu_off-0x4
> fe51: 2e e9 00 00 00 00 cs jmp fe57 <ericeric+0x17>
> fe53: R_X86_64_PLT32 __x86_return_thunk-0x4
>
> Disassembly of section .init.text:
The same code is generated for
+
+void ericeric(void);
+void ericeric(void)
+{
+ raw_cpu_read_long(this_cpu_off);
+ raw_cpu_read_long(this_cpu_off);
+}
I am guessing __raw_cpu_read() is forcing the asm to be emitted?