Re: [PATCH net-next, 3/4] hv_netvsc: Add XDP support

From: Jakub Kicinski
Date: Mon Oct 28 2019 - 17:33:32 EST


On Mon, 28 Oct 2019 21:07:04 +0000, Haiyang Zhang wrote:
> This patch adds support of XDP in native mode for hv_netvsc driver, and
> transparently sets the XDP program on the associated VF NIC as well.
>
> XDP program cannot run with LRO (RSC) enabled, so you need to disable LRO
> before running XDP:
> ethtool -K eth0 lro off
>
> XDP actions not yet supported:
> XDP_TX, XDP_REDIRECT

I don't think we want to merge support without at least XDP_TX these
days.

And without the ability to prepend headers this may be the least
complete initial XDP implementation we've seen :(

> Signed-off-by: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>

> diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
> index d22a36f..688487b 100644
> --- a/drivers/net/hyperv/netvsc.c
> +++ b/drivers/net/hyperv/netvsc.c
> @@ -122,8 +122,10 @@ static void free_netvsc_device(struct rcu_head *head)
> vfree(nvdev->send_buf);
> kfree(nvdev->send_section_map);
>
> - for (i = 0; i < VRSS_CHANNEL_MAX; i++)
> + for (i = 0; i < VRSS_CHANNEL_MAX; i++) {
> + xdp_rxq_info_unreg(&nvdev->chan_table[i].xdp_rxq);
> vfree(nvdev->chan_table[i].mrc.slots);
> + }
>
> kfree(nvdev);
> }
> @@ -1370,6 +1372,10 @@ struct netvsc_device *netvsc_device_add(struct hv_device *device,
> nvchan->net_device = net_device;
> u64_stats_init(&nvchan->tx_stats.syncp);
> u64_stats_init(&nvchan->rx_stats.syncp);
> +
> + xdp_rxq_info_reg(&nvchan->xdp_rxq, ndev, i);
> + xdp_rxq_info_reg_mem_model(&nvchan->xdp_rxq,
> + MEM_TYPE_PAGE_SHARED, NULL);

These can fail.

> }
>
> /* Enable NAPI handler before init callbacks */
> diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c
> new file mode 100644
> index 0000000..4d235ac
> --- /dev/null
> +++ b/drivers/net/hyperv/netvsc_bpf.c
> @@ -0,0 +1,211 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2019, Microsoft Corporation.
> + *
> + * Author:
> + * Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/netdevice.h>
> +#include <linux/etherdevice.h>
> +#include <linux/ethtool.h>
> +#include <linux/bpf.h>
> +#include <linux/bpf_trace.h>
> +#include <linux/kernel.h>
> +#include <net/xdp.h>
> +
> +#include <linux/mutex.h>
> +#include <linux/rtnetlink.h>
> +
> +#include "hyperv_net.h"
> +
> +u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
> + void **p_pbuf)
> +{
> + struct page *page = NULL;
> + void *data = nvchan->rsc.data[0];
> + u32 len = nvchan->rsc.len[0];
> + void *pbuf = data;
> + struct bpf_prog *prog;
> + struct xdp_buff xdp;
> + u32 act = XDP_PASS;
> +
> + *p_pbuf = NULL;
> +
> + rcu_read_lock();
> + prog = rcu_dereference(nvchan->bpf_prog);
> +
> + if (!prog || nvchan->rsc.cnt > 1)

Can rsc.cnt == 1 not be ensured at setup time? This looks quite
limiting if random frames could be forced to bypass the filter.

> + goto out;
> +
> + /* copy to a new page buffer if data are not within a page */
> + if (virt_to_page(data) != virt_to_page(data + len - 1)) {
> + page = alloc_page(GFP_ATOMIC);
> + if (!page)
> + goto out;

Returning XDP_PASS on allocation failure seems highly questionable.

> + pbuf = page_address(page);
> + memcpy(pbuf, nvchan->rsc.data[0], len);
> +
> + *p_pbuf = pbuf;
> + }
> +
> + xdp.data_hard_start = pbuf;
> + xdp.data = xdp.data_hard_start;

This patch also doesn't add any headroom for XDP to prepend data :(

> + xdp_set_data_meta_invalid(&xdp);
> + xdp.data_end = xdp.data + len;
> + xdp.rxq = &nvchan->xdp_rxq;
> + xdp.handle = 0;
> +
> + act = bpf_prog_run_xdp(prog, &xdp);
> +
> + switch (act) {
> + case XDP_PASS:
> + /* Pass to upper layers */
> + break;
> +
> + case XDP_ABORTED:
> + trace_xdp_exception(ndev, prog, act);
> + break;
> +
> + case XDP_DROP:
> + break;
> +
> + default:
> + bpf_warn_invalid_xdp_action(act);
> + }
> +
> +out:
> + rcu_read_unlock();
> +
> + if (page && act != XDP_PASS) {
> + *p_pbuf = NULL;
> + __free_page(page);
> + }
> +
> + return act;
> +}
> +
> +unsigned int netvsc_xdp_fraglen(unsigned int len)
> +{
> + return SKB_DATA_ALIGN(len) +
> + SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +}
> +
> +struct bpf_prog *netvsc_xdp_get(struct netvsc_device *nvdev)
> +{
> + return rtnl_dereference(nvdev->chan_table[0].bpf_prog);
> +}
> +
> +int netvsc_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> + struct netvsc_device *nvdev)
> +{
> + struct bpf_prog *old_prog;
> + int frag_max, i;
> +
> + old_prog = netvsc_xdp_get(nvdev);
> +
> + if (!old_prog && !prog)
> + return 0;

I think this case is now handled by the core.

> + frag_max = netvsc_xdp_fraglen(dev->mtu + ETH_HLEN);
> + if (prog && frag_max > PAGE_SIZE) {
> + netdev_err(dev, "XDP: mtu:%u too large, frag:%u\n",
> + dev->mtu, frag_max);
> + return -EOPNOTSUPP;
> + }
> +
> + if (prog && (dev->features & NETIF_F_LRO)) {
> + netdev_err(dev, "XDP: not support LRO\n");

Please report this via extack, that way users will see it in the console
in which they're installing the program.

> + return -EOPNOTSUPP;
> + }
> +
> + if (prog) {
> + prog = bpf_prog_add(prog, nvdev->num_chn);
> + if (IS_ERR(prog))
> + return PTR_ERR(prog);
> + }
> +
> + for (i = 0; i < nvdev->num_chn; i++)
> + rcu_assign_pointer(nvdev->chan_table[i].bpf_prog, prog);
> +
> + if (old_prog)
> + for (i = 0; i < nvdev->num_chn; i++)
> + bpf_prog_put(old_prog);
> +
> + return 0;
> +}
> +
> +int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog)
> +{
> + struct netdev_bpf xdp;
> + bpf_op_t ndo_bpf;
> +
> + ASSERT_RTNL();
> +
> + if (!vf_netdev)
> + return 0;
> +
> + ndo_bpf = vf_netdev->netdev_ops->ndo_bpf;
> + if (!ndo_bpf)
> + return 0;
> +
> + memset(&xdp, 0, sizeof(xdp));
> +
> + xdp.command = XDP_SETUP_PROG;
> + xdp.prog = prog;
> +
> + return ndo_bpf(vf_netdev, &xdp);

IMHO the automatic propagation is not a good idea. Especially if the
propagation doesn't make the entire installation fail if VF doesn't
have ndo_bpf.

> +}