Re: [PATCH net v1 2/2] net: gro: add p_off param in *_gro_complete
From: Richard Gobert
Date: Wed Apr 17 2024 - 09:48:50 EST
Willem de Bruijn wrote:
> Richard Gobert wrote:
>> Commits a602456 ("udp: Add GRO functions to UDP socket") and 57c67ff ("udp:
>> additional GRO support") introduce incorrect usage of {ip,ipv6}_hdr in the
>> complete phase of gro. The functions always return skb->network_header,
>> which in the case of encapsulated packets at the gro complete phase, is
>> always set to the innermost L3 of the packet. That means that calling
>> {ip,ipv6}_hdr for skbs which completed the GRO receive phase (both in
>> gro_list and *_gro_complete) when parsing an encapsulated packet's _outer_
>> L3/L4 may return an unexpected value.
>>
>> This incorrect usage leads to a bug in GRO's UDP socket lookup.
>> udp{4,6}_lib_lookup_skb functions use ip_hdr/ipv6_hdr respectively. These
>> *_hdr functions return network_header which will point to the innermost L3,
>> resulting in the wrong offset being used in __udp{4,6}_lib_lookup with
>> encapsulated packets.
>>
>> To fix this issue, a p_off param is added to *_gro_complete to pass the
>> offset of the previous layer.
>>
>> Reproduction example:
>>
>> Endpoint configuration example (fou + local address bind)
>>
>> # ip fou add port 6666 ipproto 4
>> # ip link add name tun1 type ipip remote 2.2.2.1 local 2.2.2.2 encap fou encap-dport 5555 encap-sport 6666 mode ipip
>> # ip link set tun1 up
>> # ip a add 1.1.1.2/24 dev tun1
>>
>> Netperf TCP_STREAM result on net-next before patch is applied:
>>
>> net-next main, GRO enabled:
>> $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
>> Recv Send Send
>> Socket Socket Message Elapsed
>> Size Size Size Time Throughput
>> bytes bytes bytes secs. 10^6bits/sec
>>
>> 131072 16384 16384 5.28 2.37
>>
>> net-next main, GRO disabled:
>> $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
>> Recv Send Send
>> Socket Socket Message Elapsed
>> Size Size Size Time Throughput
>> bytes bytes bytes secs. 10^6bits/sec
>>
>> 131072 16384 16384 5.01 2745.06
>>
>> patch applied, GRO enabled:
>> $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
>> Recv Send Send
>> Socket Socket Message Elapsed
>> Size Size Size Time Throughput
>> bytes bytes bytes secs. 10^6bits/sec
>>
>> 131072 16384 16384 5.01 2877.38
>>
>> Fixes: 57c67ff4bd92 ("udp: additional GRO support")
>> Suggested-by: Eric Dumazet <edumazet@xxxxxxxxxx>
>> Signed-off-by: Richard Gobert <richardbgobert@xxxxxxxxx>
>
>> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
>> index 163f94a5a58f..9c18a39b0d0c 100644
>> --- a/drivers/net/geneve.c
>> +++ b/drivers/net/geneve.c
>> @@ -555,7 +555,7 @@ static struct sk_buff *geneve_gro_receive(struct sock *sk,
>> }
>>
>> static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
>> - int nhoff)
>> + int p_off, int nhoff)
>> {
>> struct genevehdr *gh;
>> struct packet_offload *ptype;
>> @@ -569,11 +569,12 @@ static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
>>
>> /* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */
>> if (likely(type == htons(ETH_P_TEB)))
>> - return eth_gro_complete(skb, nhoff + gh_len);
>> + return eth_gro_complete(skb, p_off, nhoff + gh_len);
>
> Since the new field to the callback is only used between IP and
> transport layer callback implementations, I think the others should
> just return zero, to make it clear that the value is unused.
>
Got it. I'll push it in v2.
> I still think that if the only issue is with udp, we can just special
> case those and pass the nhoff instead of thoff in the existing one
> available offset field, and compute the transport offset in the udp
> function. For much less code churn. But unless anyone else agrees you
> can ignore that suggestion.
>
>> -int inet_gro_complete(struct sk_buff *skb, int nhoff)
>> +int inet_gro_complete(struct sk_buff *skb, int prior_off, int nhoff)
>> {
>> struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
>> const struct net_offload *ops;
>> @@ -1667,17 +1667,17 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
>> */
>> err = INDIRECT_CALL_2(ops->callbacks.gro_complete,
>> tcp4_gro_complete, udp4_gro_complete,
>> - skb, nhoff + sizeof(*iph));
>> + skb, nhoff, nhoff + sizeof(*iph));
>
> Indentation change
>
Will fix in v2 as well, thanks!
>> struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
>> + int nhoff,
>> __be16 sport, __be16 dport)
>> {
>> - const struct iphdr *iph = ip_hdr(skb);
>> + const struct iphdr *iph = (const struct iphdr *)(skb->data + nhoff);
>
> How about instead just pass the saddr and daddr and leave the iph
> pointer to the caller (which also computes the udph pointer).
Here's a snippet I wrote - could you confirm this is what you mean?
BTW, I couldn't find a sufficiently generic union type that contains both
in_addr and in6_addr, so I wrote udp_offload_addr, which is similar to
tcp_ao_addr. Is there a more generic one? If not, should one be created?
diff --git a/include/net/gro.h b/include/net/gro.h
index ebead1d642b4..56e5e21feb00 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -405,7 +405,8 @@ INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int, int));
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct udphdr *uh, struct sock *sk);
-int udp_gro_complete(struct sk_buff *skb, int nhoff, int thoff,
+int udp_gro_complete(struct sk_buff *skb, const union udp_offload_addr *saddr,
+ const union udp_offload_addr *daddr, int thoff,
udp_lookup_t lookup);
static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
diff --git a/include/net/udp.h b/include/net/udp.h
index 601d1c3b677a..5f7224e6eb1e 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -62,6 +62,11 @@ struct udp_hslot {
spinlock_t lock;
} __attribute__((aligned(2 * sizeof(long))));
+union udp_offload_addr {
+ __be32 ipaddr;
+ struct in6_addr ip6addr;
+};
+
/**
* struct udp_table - UDP table
*
@@ -166,7 +171,9 @@ static inline void udp_csum_pull_header(struct sk_buff *skb)
UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
}
-typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, int nhoff,
+typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb,
+ const union udp_offload_addr *saddr,
+ const union udp_offload_addr *daddr,
__be16 sport, __be16 dport);
void udp_v6_early_demux(struct sk_buff *skb);
@@ -301,7 +308,9 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif, int sdif,
struct udp_table *tbl, struct sk_buff *skb);
-struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, int nhoff,
+struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
+ const union udp_offload_addr *saddr,
+ const union udp_offload_addr *daddr,
__be16 sport, __be16 dport);
struct sock *udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
@@ -312,7 +321,9 @@ struct sock *__udp6_lib_lookup(struct net *net,
const struct in6_addr *daddr, __be16 dport,
int dif, int sdif, struct udp_table *tbl,
struct sk_buff *skb);
-struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, int nhoff,
+struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
+ const union udp_offload_addr *saddr,
+ const union udp_offload_addr *daddr,
__be16 sport, __be16 dport);
int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 306d2a78fefa..151c3adecc21 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -529,17 +529,18 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
inet_sdif(skb), udptable, skb);
}
-struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, int nhoff,
+struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
+ const union udp_offload_addr *saddr,
+ const union udp_offload_addr *daddr,
__be16 sport, __be16 dport)
{
- const struct iphdr *iph = (const struct iphdr *)(skb->data + nhoff);
struct net *net = dev_net(skb->dev);
int iif, sdif;
inet_get_iif_sdif(skb, &iif, &sdif);
- return __udp4_lib_lookup(net, iph->saddr, sport,
- iph->daddr, dport, iif,
+ return __udp4_lib_lookup(net, saddr->ipaddr, sport,
+ daddr->ipaddr, dport, iif,
sdif, net->ipv4.udp_table, NULL);
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index dcf8124b1a6a..0f05c7ed05d3 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -689,7 +689,8 @@ static int udp_gro_complete_segment(struct sk_buff *skb)
return 0;
}
-int udp_gro_complete(struct sk_buff *skb, int nhoff, int thoff,
+int udp_gro_complete(struct sk_buff *skb, const union udp_offload_addr *saddr,
+ const union udp_offload_addr *daddr, int thoff,
udp_lookup_t lookup)
{
struct udphdr *uh = (struct udphdr *)(skb->data + thoff);
@@ -700,8 +701,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, int thoff,
uh->len = newlen;
sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
- udp4_lib_lookup_skb, skb, nhoff, uh->source,
- uh->dest);
+ udp4_lib_lookup_skb, skb, saddr, daddr,
+ uh->source, uh->dest);
if (sk && udp_sk(sk)->gro_complete) {
skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
: SKB_GSO_UDP_TUNNEL;
@@ -733,6 +734,8 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff,
{
const struct iphdr *iph = (const struct iphdr *)(skb->data + nhoff);
struct udphdr *uh = (struct udphdr *)(skb->data + thoff);
+ const union udp_offload_addr *saddr = (union udp_offload_addr *)&iph->saddr;
+ const union udp_offload_addr *daddr = (union udp_offload_addr *)&iph->daddr;
/* do fraglist only if there is no outer UDP encap (or we already processed it) */
if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
@@ -750,7 +753,7 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff,
uh->check = ~udp_v4_check(skb->len - thoff, iph->saddr,
iph->daddr, 0);
- return udp_gro_complete(skb, nhoff, thoff, udp4_lib_lookup_skb);
+ return udp_gro_complete(skb, saddr, daddr, thoff, udp4_lib_lookup_skb);
}
int __init udpv4_offload_init(void)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 74d4a0e4d754..124beb4f4ea5 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -270,17 +270,18 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
inet6_sdif(skb), udptable, skb);
}
-struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, int nhoff,
+struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
+ const union udp_offload_addr *saddr,
+ const union udp_offload_addr *daddr,
__be16 sport, __be16 dport)
{
- const struct ipv6hdr *iph = (const struct ipv6hdr *)(skb->data + nhoff);
struct net *net = dev_net(skb->dev);
int iif, sdif;
inet6_get_iif_sdif(skb, &iif, &sdif);
- return __udp6_lib_lookup(net, &iph->saddr, sport,
- &iph->daddr, dport, iif,
+ return __udp6_lib_lookup(net, &saddr->ip6addr, sport,
+ &daddr->ip6addr, dport, iif,
sdif, net->ipv4.udp_table, NULL);
}
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 078055665397..651ec7ade1af 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -167,6 +167,8 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff,
{
const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)(skb->data + nhoff);
struct udphdr *uh = (struct udphdr *)(skb->data + thoff);
+ const union udp_offload_addr *saddr = (union udp_offload_addr *)&ipv6h->saddr;
+ const union udp_offload_addr *daddr = (union udp_offload_addr *)&ipv6h->daddr;
/* do fraglist only if there is no outer UDP encap (or we already processed it) */
if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
@@ -181,10 +183,10 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff,
}
if (uh->check)
- uh->check = ~udp_v6_check(skb->len - thoff, &ipv6h->saddr,
- &ipv6h->daddr, 0);
+ uh->check = ~udp_v6_check(skb->len - thoff, &saddr->ip6addr,
+ &daddr->ip6addr, 0);
- return udp_gro_complete(skb, nhoff, thoff, udp6_lib_lookup_skb);
+ return udp_gro_complete(skb, saddr, daddr, thoff, udp6_lib_lookup_skb);
}
int __init udpv6_offload_init(void)