[RFC net-next 1/6] net: add kfree_skb_for_sk function

From: Yan Zhai
Date: Thu May 30 2024 - 17:47:28 EST


Implement a new kfree_skb_for_sk to replace kfree_skb_reason on a few
local receive path. The function accepts an extra receiving socket
argument, which will be set in skb->cb for kfree_skb/consume_skb
tracepoint consumption. With this extra bit of information, it will be
easier to attribute dropped packets to netns/containers and
sockets/services for performance and error monitoring purpose.

Signed-off-by: Yan Zhai <yan@xxxxxxxxxxxxxx>
---
include/linux/skbuff.h | 48 ++++++++++++++++++++++++++++++++++++++++--
net/core/dev.c | 21 +++++++-----------
net/core/skbuff.c | 29 +++++++++++++------------
3 files changed, 70 insertions(+), 28 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe7d8dbef77e..66f5b06798f2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1251,8 +1251,52 @@ static inline bool skb_data_unref(const struct sk_buff *skb,
return true;
}

-void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason);
+/*
+ * Some protocol will clear or reuse skb->dev field for other purposes. For
+ * example, TCP stack would reuse the pointer for out of order packet handling.
+ * This caused some problem for drop monitoring on kfree_skb tracepoint, since
+ * no other fields of an skb provides netns information. In addition, it is
+ * also complicated to recover receive socket information for dropped packets,
+ * because the socket lookup can be an sk-lookup BPF program.
+ *
+ * This can be addressed by just passing the rx socket to the tracepoint,
+ * because it also has valid netns binding.
+ */
+struct kfree_skb_cb {
+ enum skb_drop_reason reason; /* used only by dev_kfree_skb_irq */
+ struct sock *rx_sk;
+};
+
+#define KFREE_SKB_CB(skb) ((struct kfree_skb_cb *)(skb)->cb)
+
+/* Save cb->rx_sk before calling kfree_skb/consume_skb tracepoint, and restore
+ * after the tracepoint. This is necessary because some skb destructor might
+ * rely on values in skb->cb, e.g. unix_destruct_scm.
+ */
+#define _call_trace_kfree_skb(action, skb, sk, ...) do { \
+ if (trace_##action##_skb_enabled()) { \
+ struct kfree_skb_cb saved; \
+ saved.rx_sk = KFREE_SKB_CB(skb)->rx_sk; \
+ KFREE_SKB_CB(skb)->rx_sk = sk; \
+ trace_##action##_skb((skb), ## __VA_ARGS__); \
+ KFREE_SKB_CB(skb)->rx_sk = saved.rx_sk; \
+ } \
+} while (0)
+
+#define call_trace_kfree_skb(skb, sk, ...) \
+ _call_trace_kfree_skb(kfree, skb, sk, ## __VA_ARGS__)
+
+#define call_trace_consume_skb(skb, sk, ...) \
+ _call_trace_kfree_skb(consume, skb, sk, ## __VA_ARGS__)
+
+void __fix_address kfree_skb_for_sk(struct sk_buff *skb, struct sock *rx_sk,
+ enum skb_drop_reason reason);
+
+static inline void
+kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+{
+ kfree_skb_for_sk(skb, NULL, reason);
+}

/**
* kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason
diff --git a/net/core/dev.c b/net/core/dev.c
index 85fe8138f3e4..17516f26be92 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3135,15 +3135,6 @@ void __netif_schedule(struct Qdisc *q)
}
EXPORT_SYMBOL(__netif_schedule);

-struct dev_kfree_skb_cb {
- enum skb_drop_reason reason;
-};
-
-static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
-{
- return (struct dev_kfree_skb_cb *)skb->cb;
-}
-
void netif_schedule_queue(struct netdev_queue *txq)
{
rcu_read_lock();
@@ -3182,7 +3173,11 @@ void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
} else if (likely(!refcount_dec_and_test(&skb->users))) {
return;
}
- get_kfree_skb_cb(skb)->reason = reason;
+
+ /* There is no need to save the old cb since we are the only user. */
+ KFREE_SKB_CB(skb)->reason = reason;
+ KFREE_SKB_CB(skb)->rx_sk = NULL;
+
local_irq_save(flags);
skb->next = __this_cpu_read(softnet_data.completion_queue);
__this_cpu_write(softnet_data.completion_queue, skb);
@@ -5229,17 +5224,17 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
clist = clist->next;

WARN_ON(refcount_read(&skb->users));
- if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
+ if (likely(KFREE_SKB_CB(skb)->reason == SKB_CONSUMED))
trace_consume_skb(skb, net_tx_action);
else
trace_kfree_skb(skb, net_tx_action,
- get_kfree_skb_cb(skb)->reason);
+ KFREE_SKB_CB(skb)->reason);

if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
__napi_kfree_skb(skb,
- get_kfree_skb_cb(skb)->reason);
+ KFREE_SKB_CB(skb)->reason);
}
}

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 466999a7515e..5ce6996512a1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1190,7 +1190,8 @@ void __kfree_skb(struct sk_buff *skb)
EXPORT_SYMBOL(__kfree_skb);

static __always_inline
-bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+bool __kfree_skb_reason(struct sk_buff *skb, struct sock *rx_sk,
+ enum skb_drop_reason reason)
{
if (unlikely(!skb_unref(skb)))
return false;
@@ -1201,28 +1202,30 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
SKB_DROP_REASON_SUBSYS_NUM);

if (reason == SKB_CONSUMED)
- trace_consume_skb(skb, __builtin_return_address(0));
+ call_trace_consume_skb(skb, rx_sk, __builtin_return_address(0));
else
- trace_kfree_skb(skb, __builtin_return_address(0), reason);
+ call_trace_kfree_skb(skb, rx_sk, __builtin_return_address(0), reason);
+
return true;
}

/**
- * kfree_skb_reason - free an sk_buff with special reason
+ * kfree_skb_for_sk - free an sk_buff with special reason and receiving socket
* @skb: buffer to free
+ * @rx_sk: the socket to receive the buffer, or NULL if not applicable
* @reason: reason why this skb is dropped
*
* Drop a reference to the buffer and free it if the usage count has
- * hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
+ * hit zero. Meanwhile, pass the drop reason and rx socket to 'kfree_skb'
* tracepoint.
*/
-void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+void __fix_address kfree_skb_for_sk(struct sk_buff *skb, struct sock *rx_sk,
+ enum skb_drop_reason reason)
{
- if (__kfree_skb_reason(skb, reason))
+ if (__kfree_skb_reason(skb, rx_sk, reason))
__kfree_skb(skb);
}
-EXPORT_SYMBOL(kfree_skb_reason);
+EXPORT_SYMBOL(kfree_skb_for_sk);

#define KFREE_SKB_BULK_SIZE 16

@@ -1261,7 +1264,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
while (segs) {
struct sk_buff *next = segs->next;

- if (__kfree_skb_reason(segs, reason)) {
+ if (__kfree_skb_reason(segs, NULL, reason)) {
skb_poison_list(segs);
kfree_skb_add_bulk(segs, &sa, reason);
}
@@ -1405,7 +1408,7 @@ void consume_skb(struct sk_buff *skb)
if (!skb_unref(skb))
return;

- trace_consume_skb(skb, __builtin_return_address(0));
+ call_trace_consume_skb(skb, NULL, __builtin_return_address(0));
__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
@@ -1420,7 +1423,7 @@ EXPORT_SYMBOL(consume_skb);
*/
void __consume_stateless_skb(struct sk_buff *skb)
{
- trace_consume_skb(skb, __builtin_return_address(0));
+ call_trace_consume_skb(skb, NULL, __builtin_return_address(0));
skb_release_data(skb, SKB_CONSUMED);
kfree_skbmem(skb);
}
@@ -1478,7 +1481,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;

/* if reaching here SKB is ready to free */
- trace_consume_skb(skb, __builtin_return_address(0));
+ call_trace_consume_skb(skb, NULL, __builtin_return_address(0));

/* if SKB is a clone, don't handle this case */
if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
--
2.30.2