Re: [PATCH v5 net-next 14/15] net: Reference bpf_redirect_info via task_struct on PREEMPT_RT.

From: Jesper Dangaard Brouer
Date: Fri Jun 07 2024 - 07:51:49 EST



On 07/06/2024 08.53, Sebastian Andrzej Siewior wrote:
[...]

Create a struct bpf_net_context which contains struct bpf_redirect_info.
Define the variable on the stack, use bpf_net_ctx_set() to save a pointer
to it, and bpf_net_ctx_clear() to remove it again.
The bpf_net_ctx_set() calls may nest. For instance a function can be used
both from within NET_RX_SOFTIRQ/net_rx_action, which uses
bpf_net_ctx_set(), and from NET_TX_SOFTIRQ, which does not. Therefore
only the first invocation updates the pointer.
Use bpf_net_ctx_get_ri() as a wrapper to retrieve the current struct
bpf_redirect_info.

The pointer to bpf_net_context is saved in the task's task_struct.
Always using the bpf_net_context approach has the advantage that there
are almost no differences between PREEMPT_RT and non-PREEMPT_RT builds.
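
If I read the helpers below correctly, the nesting rule plays out like
this (a rough illustration, not actual call sites):

	struct bpf_net_context outer, inner, *o, *i;

	o = bpf_net_ctx_set(&outer);	/* first set: installs &outer and returns it */
	i = bpf_net_ctx_set(&inner);	/* already set: returns NULL, &outer stays */
	bpf_net_ctx_clear(i);		/* NULL argument: no-op */
	bpf_net_ctx_clear(o);		/* outermost caller removes the context */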

[...]
---
 include/linux/filter.h | 43 ++++++++++++++++++++++++++++++++++-------
 include/linux/sched.h  |  3 +++
 kernel/bpf/cpumap.c    |  3 +++
 kernel/bpf/devmap.c    |  9 ++++++++-
 kernel/fork.c          |  1 +
 net/bpf/test_run.c     | 11 ++++++++++-
 net/core/dev.c         | 26 ++++++++++++++++++++++++-
 net/core/filter.c      | 44 ++++++++++++------------------------------
 net/core/lwt_bpf.c     |  3 +++
 9 files changed, 101 insertions(+), 42 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index b02aea291b7e8..2ff1c394dcf0c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -744,7 +744,38 @@ struct bpf_redirect_info {
	struct bpf_nh_params nh;
};
-DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
+struct bpf_net_context {
+	struct bpf_redirect_info ri;
+};
+
+static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
+{
+	struct task_struct *tsk = current;
+
+	if (tsk->bpf_net_context != NULL)
+		return NULL;
+	memset(&bpf_net_ctx->ri, 0, sizeof(bpf_net_ctx->ri));

It annoys me that we have to clear this memory every time.
(This is added in net_rx_action(), which *all* RX packets traverse.)

The feature and memory are only/primarily used for XDP and TC redirects,
but we take the overhead of clearing even when these features are not used.

The netstack does bulking in most of the cases where this is used, so in
our/your benchmarks this overhead doesn't show. But we need to be aware
that this is a "paper-cut" for single network packet processing.

Idea: we could postpone clearing until code calls bpf_net_ctx_get().
See below.

+	tsk->bpf_net_context = bpf_net_ctx;
+	return bpf_net_ctx;
+}
+
+static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx)
+{
+	if (bpf_net_ctx)
+		current->bpf_net_context = NULL;
+}
+
+static inline struct bpf_net_context *bpf_net_ctx_get(void)
+{
+	return current->bpf_net_context;
+}
+
+static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
+{
+	struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();
+

	if (bpf_net_ctx->ri.kern_flags & BPF_RI_F_NEEDS_INIT) {
		/* memset + the init_list calls introduced in patch 15 */
	}

Maybe even postpone the init_list calls to the "get" helpers introduced
in patch 15. (A rough sketch of this lazy-init idea follows after this
hunk.)


+	return &bpf_net_ctx->ri;
+}
[...]
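
To spell out what I mean: bpf_net_ctx_set() would only mark the context
as uninitialized, and the first bpf_net_ctx_get_ri() caller would pay
for the clear. A rough, untested sketch (BPF_RI_F_NEEDS_INIT is just a
name I'm inventing for the flag; "init_list" refers to the flush lists
added in patch 15):

static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
{
	struct task_struct *tsk = current;

	if (tsk->bpf_net_context != NULL)
		return NULL;
	/* No memset in the fast path: only mark the context as dirty. */
	bpf_net_ctx->ri.kern_flags = BPF_RI_F_NEEDS_INIT;
	tsk->bpf_net_context = bpf_net_ctx;
	return bpf_net_ctx;
}

static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
{
	struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get();

	if (bpf_net_ctx->ri.kern_flags & BPF_RI_F_NEEDS_INIT) {
		/* The memset clears kern_flags as well, so the
		 * BPF_RI_F_NEEDS_INIT mark is dropped here too.
		 */
		memset(&bpf_net_ctx->ri, 0, sizeof(bpf_net_ctx->ri));
		/* + the init_list calls introduced in patch 15 */
	}
	return &bpf_net_ctx->ri;
}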

diff --git a/net/core/dev.c b/net/core/dev.c
index 2c3f86c8cd176..73965dff1b30f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
[...]
@@ -6881,10 +6902,12 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)

The function net_rx_action() is core to the network stack.

	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies +
		usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
	int budget = READ_ONCE(net_hotdata.netdev_budget);
	LIST_HEAD(list);
	LIST_HEAD(repoll);
+	bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
	sd->in_net_rx_action = true;
	local_irq_disable();
@@ -6937,7 +6960,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
	sd->in_net_rx_action = false;
	net_rps_action_and_irq_enable(sd);
-end:;
+end:
+	bpf_net_ctx_clear(bpf_net_ctx);
}


The memset can be further optimized: it currently clears 64 bytes, but
it only needs to clear 40 bytes, see the pahole output below.

Replace the memset with something like:

	memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));

This is an optimization because clearing 64 bytes results in a rep-stos
(repeated string store operation) that on Intel touches the CPU flags
(to be IRQ safe), which is slow, while clearing 40 bytes doesn't cause
the compiler to use this instruction, which is faster. Memset was
benchmarked with [1].

[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_memset.c
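
If we go with the partial clear, it might be worth a compile-time guard
so a future reshuffle of bpf_redirect_info doesn't silently leave fields
dirty. Something along these lines (my sketch, not from the patch):

	/* The partial clear must cover everything up to and including
	 * ::map_type; ::nh is deliberately left untouched.
	 */
	BUILD_BUG_ON(offsetof(struct bpf_net_context, ri.nh) <
		     offsetofend(struct bpf_redirect_info, map_type));
	memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));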

--Jesper

$ pahole -C bpf_redirect_info vmlinux
struct bpf_redirect_info {
	u64                   tgt_index;            /*     0     8 */
	void *                tgt_value;            /*     8     8 */
	struct bpf_map *      map;                  /*    16     8 */
	u32                   flags;                /*    24     4 */
	u32                   kern_flags;           /*    28     4 */
	u32                   map_id;               /*    32     4 */
	enum bpf_map_type     map_type;             /*    36     4 */
	struct bpf_nh_params  nh;                   /*    40    20 */

	/* size: 64, cachelines: 1, members: 8 */
	/* padding: 4 */
};



The full struct:

$ pahole -C bpf_net_context vmlinux
struct bpf_net_context {
	struct bpf_redirect_info  ri;                     /*     0    64 */

	/* XXX last struct has 4 bytes of padding */

	/* --- cacheline 1 boundary (64 bytes) --- */
	struct list_head          cpu_map_flush_list;     /*    64    16 */
	struct list_head          dev_map_flush_list;     /*    80    16 */
	struct list_head          xskmap_map_flush_list;  /*    96    16 */

	/* size: 112, cachelines: 2, members: 4 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 48 bytes */
};