[RFC net-next 1/2] Introduce an eBPF hookpoint for tx queue selection in the XPS (Transmit Packet Steering) code.

From: Matthew Cover
Date: Thu Sep 19 2019 - 18:45:56 EST


WORK IN PROGRESS:
* bpf program loading works!
* txq steering via bpf program return code works!
* bpf program unloading (detach) is not working yet.
* querying which bpf program is attached is not working yet.
---
include/linux/netdevice.h | 3 +++
include/uapi/linux/if_link.h | 12 +++++++++
net/core/dev.c | 61 ++++++++++++++++++++++++++++++++++++-------
net/core/rtnetlink.c | 62 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 129 insertions(+), 9 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9eda1c3..88e37d5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1966,6 +1966,7 @@ struct net_device {
#ifdef CONFIG_XPS
struct xps_dev_maps __rcu *xps_cpus_map;
struct xps_dev_maps __rcu *xps_rxqs_map;
+ struct bpf_prog __rcu *xps_prog;
#endif
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
@@ -2147,6 +2148,8 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
struct sk_buff *skb,
struct net_device *sb_dev);

+int dev_change_xps_fd(struct net_device *dev, int fd);
+
/* returns the headroom that the master device needs to take in account
* when forwarding to this dev
*/
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 4a8c02c..a23d241 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -167,6 +167,7 @@ enum {
IFLA_NEW_IFINDEX,
IFLA_MIN_MTU,
IFLA_MAX_MTU,
+ IFLA_XPS,
__IFLA_MAX
};

@@ -979,6 +980,17 @@ enum {

#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1)

+/* XPS section */
+
+enum { /* attributes nested inside IFLA_XPS */
+ IFLA_XPS_UNSPEC,
+ IFLA_XPS_FD, /* s32: bpf program fd handed to dev_change_xps_fd() on setlink */
+ IFLA_XPS_ATTACHED, /* u8: emitted by rtnl_xps_fill() when a program is attached */
+ __IFLA_XPS_MAX, /* keep last: IFLA_XPS_MAX is derived from it */
+};
+
+#define IFLA_XPS_MAX (__IFLA_XPS_MAX - 1)
+
enum {
IFLA_EVENT_NONE,
IFLA_EVENT_REBOOT, /* internal reset / reboot */
diff --git a/net/core/dev.c b/net/core/dev.c
index 71b18e8..a46d42b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3663,26 +3663,34 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
{
#ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
+ struct bpf_prog *prog;
struct sock *sk = skb->sk;
+ int bpf_ret = -1;
int queue_index = -1;

if (!static_key_false(&xps_needed))
return -1;

rcu_read_lock();
- if (!static_key_false(&xps_rxqs_needed))
- goto get_cpus_map;

- dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
- if (dev_maps) {
- int tci = sk_rx_queue_get(sk);
+ prog = rcu_dereference(dev->xps_prog);
+ if (prog) {
+ bpf_ret = bpf_prog_run_clear_cb(prog, skb);
+ if (bpf_ret >= 0)
+ queue_index = bpf_ret % dev->num_tx_queues;
+ }

- if (tci >= 0 && tci < dev->num_rx_queues)
- queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
- tci);
+ if (queue_index < 0 && static_key_false(&xps_rxqs_needed)) {
+ dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
+ if (dev_maps) {
+ int tci = sk_rx_queue_get(sk);
+
+ if (tci >= 0 && tci < dev->num_rx_queues)
+ queue_index = __get_xps_queue_idx(dev, skb,
+ dev_maps, tci);
+ }
}

-get_cpus_map:
if (queue_index < 0) {
dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
if (dev_maps) {
@@ -8170,6 +8178,41 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
return err;
}

+/*
+ * dev_xps_install - swap the tx queue selection bpf program of a device
+ * @dev: device whose xps_prog pointer is updated
+ * @prog: program to publish, or NULL to leave the device without one
+ *
+ * Consumes the caller's reference on @prog. The program that was installed
+ * before, if any, has its reference dropped once the new pointer has been
+ * published. The old pointer is read with rtnl_dereference(), so the caller
+ * is expected to hold RTNL.
+ */
+static void dev_xps_install(struct net_device *dev, struct bpf_prog *prog)
+{
+#ifdef CONFIG_XPS
+	struct bpf_prog *old = rtnl_dereference(dev->xps_prog);
+
+	rcu_assign_pointer(dev->xps_prog, prog);
+	if (old)
+		bpf_prog_put(old);
+#else
+	/*
+	 * There is no xps_prog slot without CONFIG_XPS; release the
+	 * reference we were handed instead of leaking it.
+	 */
+	if (prog)
+		bpf_prog_put(prog);
+#endif
+}
+
+/**
+ * dev_change_xps_fd - set or clear a bpf program for tx queue selection for a device
+ * @dev: device
+ * @fd: new program fd or negative value to clear
+ *
+ * Set or clear the bpf program consulted for tx queue selection on @dev.
+ * A non-negative @fd must refer to a BPF_PROG_TYPE_SOCKET_FILTER program;
+ * a negative @fd installs no program, which detaches the current one.
+ * Must be called with RTNL held.
+ *
+ * Return: 0 on success, or the negative errno from the fd lookup.
+ */
+int dev_change_xps_fd(struct net_device *dev, int fd)
+{
+	struct bpf_prog *prog = NULL;
+
+	ASSERT_RTNL();
+
+	/* A negative fd is the "clear" request: there is nothing to look up. */
+	if (fd >= 0) {
+		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+
+	dev_xps_install(dev, prog);
+
+	return 0;
+}
+
/**
* dev_new_index - allocate an ifindex
* @net: the applicable net namespace
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1ee6460..202b59a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -980,6 +980,15 @@ static size_t rtnl_xdp_size(void)
return xdp_size;
}

+/*
+ * Space reserved in a link message for the IFLA_XPS nest: the nest header,
+ * one 1-byte attribute and one 4-byte attribute.
+ * NOTE(review): the 4-byte slot is labelled XPS_PROG_ID, but the IFLA_XPS_*
+ * enum added by this patch has no such attribute - confirm the intent.
+ */
+static size_t rtnl_xps_size(void)
+{
+	return nla_total_size(0) +	/* nest IFLA_XPS */
+	       nla_total_size(1) +	/* XPS_ATTACHED */
+	       nla_total_size(4);	/* XPS_PROG_ID */
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
@@ -1018,6 +1027,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+ rtnl_xdp_size() /* IFLA_XDP */
+ + rtnl_xps_size() /* IFLA_XPS */
+ nla_total_size(4) /* IFLA_EVENT */
+ nla_total_size(4) /* IFLA_NEW_NETNSID */
+ nla_total_size(4) /* IFLA_NEW_IFINDEX */
@@ -1455,6 +1465,31 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
return err;
}

+/*
+ * rtnl_xps_fill - append the IFLA_XPS nest describing @dev to @skb
+ * @skb: netlink message being built
+ * @dev: device being reported
+ *
+ * Opens an IFLA_XPS nest and, when a tx queue selection program is
+ * installed on @dev, adds IFLA_XPS_ATTACHED with the value 1. The attribute
+ * is a u8 flag, so the 32-bit program id must not be stored in it: it would
+ * be truncated, and an id whose low byte is zero would read as "none".
+ * Must be called with RTNL held.
+ *
+ * Return: 0 on success, -EMSGSIZE if the nest cannot be started, or the
+ * error from nla_put_u8() (the partially built nest is cancelled first).
+ */
+static int rtnl_xps_fill(struct sk_buff *skb, struct net_device *dev)
+{
+	struct nlattr *xps;
+	bool attached = false;
+	int err;
+
+	ASSERT_RTNL();
+
+#ifdef CONFIG_XPS
+	/* dev->xps_prog only exists when XPS is built in. */
+	attached = rtnl_dereference(dev->xps_prog) != NULL;
+#endif
+
+	xps = nla_nest_start(skb, IFLA_XPS);
+	if (!xps)
+		return -EMSGSIZE;
+
+	if (attached) {
+		err = nla_put_u8(skb, IFLA_XPS_ATTACHED, 1);
+		if (err) {
+			nla_nest_cancel(skb, xps);
+			return err;
+		}
+	}
+
+	nla_nest_end(skb, xps);
+	return 0;
+}
+
static u32 rtnl_get_event(unsigned long event)
{
u32 rtnl_event_type = IFLA_EVENT_NONE;
@@ -1697,6 +1732,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure_rcu;
rcu_read_unlock();

+ if (rtnl_xps_fill(skb, dev))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;

@@ -1750,6 +1788,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
[IFLA_MIN_MTU] = { .type = NLA_U32 },
[IFLA_MAX_MTU] = { .type = NLA_U32 },
+ [IFLA_XPS] = { .type = NLA_NESTED },
};

static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1801,6 +1840,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
[IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
};

+static const struct nla_policy ifla_xps_policy[IFLA_XPS_MAX + 1] = { /* validates attributes nested in IFLA_XPS */
+ [IFLA_XPS_FD] = { .type = NLA_S32 }, /* read with nla_get_s32() in do_setlink() */
+ [IFLA_XPS_ATTACHED] = { .type = NLA_U8 }, /* written with nla_put_u8() in rtnl_xps_fill() */
+};
+
static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
{
const struct rtnl_link_ops *ops = NULL;
@@ -2709,6 +2753,24 @@ static int do_setlink(const struct sk_buff *skb,
}
}

+ if (tb[IFLA_XPS]) {
+ struct nlattr *xps[IFLA_XPS_MAX + 1];
+
+ err = nla_parse_nested_deprecated(xps, IFLA_XPS_MAX,
+ tb[IFLA_XPS],
+ ifla_xps_policy, NULL);
+ if (err < 0)
+ goto errout;
+
+ if (xps[IFLA_XPS_FD]) {
+ err = dev_change_xps_fd(dev,
+ nla_get_s32(xps[IFLA_XPS_FD]));
+ if (err)
+ goto errout;
+ status |= DO_SETLINK_NOTIFY;
+ }
+ }
+
errout:
if (status & DO_SETLINK_MODIFIED) {
if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
--
1.8.3.1