Re: [PATCH v9 net-next 08/15] net: softnet_data: Make xmit per task.
From: Sebastian Andrzej Siewior
Date: Mon Jun 24 2024 - 06:20:36 EST
On 2024-06-21 19:12:45 [-0700], Jakub Kicinski wrote:
> On Thu, 20 Jun 2024 15:21:58 +0200 Sebastian Andrzej Siewior wrote:
> > +static inline void netdev_xmit_set_more(bool more)
> > +{
> > +	current->net_xmit.more = more;
> > +}
> > +
> > +static inline bool netdev_xmit_more(void)
> > +{
> > +	return current->net_xmit.more;
> > +}
> > +#endif
> > +
> > +static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
> > +					       struct sk_buff *skb, struct net_device *dev,
> > +					       bool more)
> > +{
> > +	netdev_xmit_set_more(more);
> > +	return ops->ndo_start_xmit(skb, dev);
> > +}
>
> The series looks clean, I'm happy for it to be applied as is.
>
> But I'm curious whether similar helper organization as with the BPF
> code would work. By which I mean - instead of read / write helpers
> for each member can we not have one helper which returns the struct?
> It would be a per-CPU struct on !RT and pointer from current on RT.
> Does it change the generated code? Or stripping the __percpu annotation
> is a PITA?
You are asking for
| #ifndef CONFIG_PREEMPT_RT
| static inline struct netdev_xmit *netdev_get_xmit(void)
| {
| 	return this_cpu_ptr(&softnet_data.xmit);
| }
| #else
| static inline struct netdev_xmit *netdev_get_xmit(void)
| {
| 	return &current->net_xmit;
| }
| #endif
on one side, so that we can then have
| static inline void dev_xmit_recursion_inc(void)
| {
| 	netdev_get_xmit()->recursion++;
| }
|
| static inline void dev_xmit_recursion_dec(void)
| {
| 	netdev_get_xmit()->recursion--;
| }
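For reference, the per-member helpers as they are in the series use the
optimized per-CPU accessors directly on the !RT side (a sketch, assuming
__this_cpu_inc()/__this_cpu_dec() on softnet_data):
| static inline void dev_xmit_recursion_inc(void)
| {
| 	__this_cpu_inc(softnet_data.xmit.recursion);
| }
|
| static inline void dev_xmit_recursion_dec(void)
| {
| 	__this_cpu_dec(softnet_data.xmit.recursion);
| }
That is what the first listing below corresponds to.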
This changes the generated code slightly: the inc grows from one opcode to
two. For reference, here is a __dev_direct_xmit() snippet as generated now:
| addl $512, %gs:pcpu_hot+8(%rip) #, *_45
local_bh_disable();
| incw %gs:softnet_data+120(%rip) # *_44
dev_xmit_recursion_inc();
| testb $16, 185(%rbx) #, dev_24->features
| je .L3310 #,
| movl $16, %r13d #, <retval>
| testb $5, 208(%r12) #, MEM[(const struct netdev_queue *)_54].state
| je .L3290 #,
| movl $512, %esi #,
^ part of local_bh_enable();
| decw %gs:softnet_data+120(%rip) # *_44
dev_xmit_recursion_dec();
| lea 0(%rip), %rdi # __here
| call __local_bh_enable_ip #
With the change mentioned above we get:
| addl $512, %gs:pcpu_hot+8(%rip) #, *_51
local_bh_disable();
| movq %gs:this_cpu_off(%rip), %rax # *_44, tcp_ptr__
| addw $1, softnet_data+120(%rax) #, _48->recursion
two opcodes for dev_xmit_recursion_inc()
| testb $16, 185(%rbx) #, dev_24->features
| je .L3310 #,
| movl $16, %r13d #, <retval>
| testb $5, 208(%r12) #, MEM[(const struct netdev_queue *)_60].state
| je .L3290 #,
| movq %gs:this_cpu_off(%rip), %rax # *_44, tcp_ptr__
one opcode from dev_xmit_recursion_dec()
| movl $512, %esi #,
part of local_bh_enable()
| lea 0(%rip), %rdi # __here
| subw $1, softnet_data+120(%rax) #, _68->recursion
second opcode from dev_xmit_recursion_dec()
| call __local_bh_enable_ip #
So we end up with one additional opcode per usage, and I can't tell how
bad that is. The second access (the dec) got interleaved with other
instructions, so it might hide in otherwise idle cycles. Instead of one
optimized per-CPU operation we get two plain ones, and the per-CPU pointer
isn't cached between them (this_cpu_off is reloaded for the dec).
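To spell out where the extra opcode comes from (my annotation of the asm
above, not code from the series): the optimized accessor is a single
%gs-relative RMW, while going through the pointer returned by
netdev_get_xmit() first has to materialize the per-CPU base:
| 	/* per-member helper, !RT: one gs-relative RMW */
| 	__this_cpu_inc(softnet_data.xmit.recursion);
| 	/*	incw %gs:softnet_data+120(%rip)		*/
|
| 	/* via netdev_get_xmit(): load the per-CPU base, then a plain RMW */
| 	this_cpu_ptr(&softnet_data.xmit)->recursion++;
| 	/*	movq %gs:this_cpu_off(%rip), %rax	*/
| 	/*	addw $1, softnet_data+120(%rax)		*/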
And in case you ask, the task version looks like this:
| addl $512, %gs:pcpu_hot+8(%rip) #, *_47
local_bh_disable()
| movq %gs:const_pcpu_hot(%rip), %r14 # const_pcpu_hot.D.2663.D.2661.current_task, _44
| movzwl 2426(%r14), %eax # MEM[(struct netdev_xmit *)_44 + 2426B].recursion, _45
| leal 1(%rax), %edx #, tmp140
| movw %dx, 2426(%r14) # tmp140, MEM[(struct netdev_xmit *)_44 + 2426B].recursion
four opcodes for the inc.
| testb $16, 185(%rbx) #, dev_24->features
| je .L3311 #,
| movl $16, %r13d #, <retval>
| testb $5, 208(%r12) #, MEM[(const struct netdev_queue *)_56].state
| je .L3291 #,
| movw %ax, 2426(%r14) # _45, MEM[(struct netdev_xmit *)_44 + 2426B].recursion
but then gcc recycles the initial value for the dec: it stores the
pre-increment value from %ax back instead of doing a load-dec-store. On
the paths where a function gets called in between, it reloads the value
and decrements it instead.
| movl $512, %esi #,
| lea 0(%rip), %rdi # __here
| call __local_bh_enable_ip #
|
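The RT side behind this listing is just the plain per-task access, the
same pattern as netdev_xmit_set_more() in the quoted patch above (again a
sketch):
| static inline void dev_xmit_recursion_inc(void)
| {
| 	current->net_xmit.recursion++;
| }
|
| static inline void dev_xmit_recursion_dec(void)
| {
| 	current->net_xmit.recursion--;
| }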
Do you want me to update the series, or should it go in as is?
Sebastian