Re: [PATCH/v4] bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH

From: Willem de Bruijn
Date: Wed Mar 03 2021 - 14:37:35 EST


On Wed, Mar 3, 2021 at 7:33 AM Xuesen Huang <hxseverything@xxxxxxxxx> wrote:
>
> From: Xuesen Huang <huangxuesen@xxxxxxxxxxxx>
>
> bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets
> encapsulation. But that is not appropriate when pushing Ethernet header.
>
> Add an option to further specify encap L2 type and set the inner_protocol
> as ETH_P_TEB.
>
> Update test_tc_tunnel to verify adding vxlan encapsulation works with
> this flag.
>
> Suggested-by: Willem de Bruijn <willemb@xxxxxxxxxx>
> Signed-off-by: Xuesen Huang <huangxuesen@xxxxxxxxxxxx>
> Signed-off-by: Zhiyong Cheng <chengzhiyong@xxxxxxxxxxxx>
> Signed-off-by: Li Wang <wangli09@xxxxxxxxxxxx>

Thanks for adding the test. Perhaps that is better in a separate patch?

Overall looks great to me.

The patch has not (yet?) arrived on patchwork.

> enum {
> diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> index 37bce7a..6e144db 100644
> --- a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> +++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
> @@ -20,6 +20,14 @@
> #include <bpf/bpf_endian.h>
> #include <bpf/bpf_helpers.h>
>
> +#define encap_ipv4(...) __encap_ipv4(__VA_ARGS__, 0)
> +
> +#define encap_ipv4_with_ext_proto(...) __encap_ipv4(__VA_ARGS__)
> +
> +#define encap_ipv6(...) __encap_ipv6(__VA_ARGS__, 0)
> +
> +#define encap_ipv6_with_ext_proto(...) __encap_ipv6(__VA_ARGS__)
> +

Instead of untyped macros, I'd define encap_ipv4 as a function that
calls __encap_ipv4.

And no need for encap_ipv4_with_ext_proto equivalent to __encap_ipv4.

> static const int cfg_port = 8000;
>
> static const int cfg_udp_src = 20000;
> @@ -27,11 +35,24 @@
> #define UDP_PORT 5555
> #define MPLS_OVER_UDP_PORT 6635
> #define ETH_OVER_UDP_PORT 7777
> +#define VXLAN_UDP_PORT 8472
> +
> +#define EXTPROTO_VXLAN 0x1
> +
> +#define VXLAN_N_VID (1u << 24)
> +#define VXLAN_VNI_MASK bpf_htonl((VXLAN_N_VID - 1) << 8)
> +#define VXLAN_FLAGS 0x8
> +#define VXLAN_VNI 1
>
> /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
> static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
> MPLS_LS_S_MASK | 0xff);
>
> +struct vxlanhdr {
> + __be32 vx_flags;
> + __be32 vx_vni;
> +} __attribute__((packed));
> +
> struct gre_hdr {
> __be16 flags;
> __be16 protocol;
> @@ -45,13 +66,13 @@ struct gre_hdr {
> struct v4hdr {
> struct iphdr ip;
> union l4hdr l4hdr;
> - __u8 pad[16]; /* enough space for L2 header */
> + __u8 pad[24]; /* space for L2 header / vxlan header ... */

could we use something like sizeof(..) instead of a constant?

> @@ -171,14 +197,26 @@ static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
> }
>
> /* add L2 encap (if specified) */
> + l2_hdr = (__u8 *)&h_outer + olen;
> switch (l2_proto) {
> case ETH_P_MPLS_UC:
> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> + *(__u32 *)l2_hdr = mpls_label;
> break;
> case ETH_P_TEB:
> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> - ETH_HLEN))

This is non-standard indentation? Here and elsewhere.

> @@ -249,7 +288,11 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> break;
> case ETH_P_TEB:
> l2_len = ETH_HLEN;
> - udp_dst = ETH_OVER_UDP_PORT;
> + if (ext_proto & EXTPROTO_VXLAN) {
> + udp_dst = VXLAN_UDP_PORT;
> + l2_len += sizeof(struct vxlanhdr);
> + } else
> + udp_dst = ETH_OVER_UDP_PORT;
> break;
> }
> flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
> @@ -267,7 +310,7 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
> h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
> tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
> - sizeof(h_outer.l4hdr.udp);
> + sizeof(h_outer.l4hdr.udp) + l2_len;

Was this a bug previously?

> h_outer.l4hdr.udp.check = 0;
> h_outer.l4hdr.udp.len = bpf_htons(tot_len);
> break;
> @@ -278,13 +321,24 @@ static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
> }
>
> /* add L2 encap (if specified) */
> + l2_hdr = (__u8 *)&h_outer + olen;
> switch (l2_proto) {
> case ETH_P_MPLS_UC:
> - *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
> + *(__u32 *)l2_hdr = mpls_label;
> break;
> case ETH_P_TEB:
> - if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
> - ETH_HLEN))
> + flags |= BPF_F_ADJ_ROOM_ENCAP_L2_ETH;

This is a change also for the existing case. Correctly so, I imagine.
But the test used to pass with the wrong protocol?