[PATCH v5 net-next 6/9] net/sched: mqprio: allow per-TC user input of FP adminStatus

From: Vladimir Oltean
Date: Tue Apr 11 2023 - 14:03:08 EST


IEEE 802.1Q-2018 clause 6.7.2 Frame preemption specifies that each
packet priority can be assigned to a "frame preemption status" value of
either "express" or "preemptible". Express priorities are transmitted by
the local device through the eMAC, and preemptible priorities through
the pMAC (the concepts of eMAC and pMAC come from the 802.3 MAC Merge
layer).

The FP adminStatus is defined per packet priority, but 802.1Q clause
12.30.1.1.1 framePreemptionAdminStatus also says that:

| Priorities that all map to the same traffic class should be
| constrained to use the same value of preemption status.

It is impossible to ignore the cognitive dissonance in the standard
here, because it practically means that the FP adminStatus only takes
distinct values per traffic class, even though it is defined per
priority.

I can see no valid use case which is prevented by having the kernel take
the FP adminStatus as input per traffic class (what we do here).
In addition, this also enforces the above constraint by construction.
User space network managers which wish to expose FP adminStatus per
priority are free to do so; they must only observe the prio_tc_map of
the netdev (which presumably is also under their control, when
constructing the mqprio netlink attributes).

The reason for configuring frame preemption as a property of the Qdisc
layer is that the information about "preemptible TCs" is closest to the
place which handles the num_tc and prio_tc_map of the netdev. If the
UAPI had lived in any other layer, it would be unclear what to do
with the FP information when num_tc collapses to 0. A key assumption is
that only mqprio/taprio change the num_tc and prio_tc_map of the netdev.
Not sure if that's a great assumption to make.

Having FP in tc-mqprio can be seen as an implementation of the use case
defined in 802.1Q Annex S.2 "Preemption used in isolation". There will
be a separate implementation of FP in tc-taprio, for the other use
cases.

Signed-off-by: Vladimir Oltean <vladimir.oltean@xxxxxxx>
Reviewed-by: Ferenc Fejes <fejes@xxxxxxxxxxx>
Reviewed-by: Simon Horman <simon.horman@xxxxxxxxxxxx>
---
v4->v5:
- don't initialize tb twice, nla_parse_nested() does it
- use NL_REQ_ATTR_CHECK() and NL_SET_ERR_MSG_ATTR() for
TCA_MQPRIO_TC_ENTRY_INDEX
v3->v4: none
v2->v3: none
v1->v2:
- slightly reword commit message
- move #include <linux/ethtool_netlink.h> to this patch
- remove self-evident comment "only for dump and offloading"

include/net/pkt_sched.h | 1 +
include/uapi/linux/pkt_sched.h | 16 +++++
net/sched/sch_mqprio.c | 128 ++++++++++++++++++++++++++++++++-
net/sched/sch_mqprio_lib.c | 14 ++++
net/sched/sch_mqprio_lib.h | 2 +
5 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index b43ed4733455..f436688b6efc 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -172,6 +172,7 @@ struct tc_mqprio_qopt_offload {
u32 flags;
u64 min_rate[TC_QOPT_MAX_QUEUE];
u64 max_rate[TC_QOPT_MAX_QUEUE];
+ unsigned long preemptible_tcs;
};

struct tc_taprio_caps {
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 000eec106856..b8d29be91b62 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -719,6 +719,11 @@ enum {

#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)

+enum {
+ TC_FP_EXPRESS = 1,
+ TC_FP_PREEMPTIBLE = 2,
+};
+
struct tc_mqprio_qopt {
__u8 num_tc;
__u8 prio_tc_map[TC_QOPT_BITMASK + 1];
@@ -732,12 +737,23 @@ struct tc_mqprio_qopt {
#define TC_MQPRIO_F_MIN_RATE 0x4
#define TC_MQPRIO_F_MAX_RATE 0x8

+enum {
+ TCA_MQPRIO_TC_ENTRY_UNSPEC,
+ TCA_MQPRIO_TC_ENTRY_INDEX, /* u32 */
+ TCA_MQPRIO_TC_ENTRY_FP, /* u32 */
+
+ /* add new constants above here */
+ __TCA_MQPRIO_TC_ENTRY_CNT,
+ TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1)
+};
+
enum {
TCA_MQPRIO_UNSPEC,
TCA_MQPRIO_MODE,
TCA_MQPRIO_SHAPER,
TCA_MQPRIO_MIN_RATE64,
TCA_MQPRIO_MAX_RATE64,
+ TCA_MQPRIO_TC_ENTRY,
__TCA_MQPRIO_MAX,
};

diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 67d77495c8fd..dc5a0ff50b14 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -5,6 +5,7 @@
* Copyright (c) 2010 John Fastabend <john.r.fastabend@xxxxxxxxx>
*/

+#include <linux/ethtool_netlink.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -27,6 +28,7 @@ struct mqprio_sched {
u32 flags;
u64 min_rate[TC_QOPT_MAX_QUEUE];
u64 max_rate[TC_QOPT_MAX_QUEUE];
+ u32 fp[TC_QOPT_MAX_QUEUE];
};

static int mqprio_enable_offload(struct Qdisc *sch,
@@ -63,6 +65,8 @@ static int mqprio_enable_offload(struct Qdisc *sch,
return -EINVAL;
}

+ mqprio_fp_to_offload(priv->fp, &mqprio);
+
err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
&mqprio);
if (err)
@@ -145,13 +149,95 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
return 0;
}

+/* INDEX subscripts fp[TC_QOPT_MAX_QUEUE], so the last valid TC is MAX_QUEUE - 1 */
+static const struct nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = {
+	[TCA_MQPRIO_TC_ENTRY_INDEX]	= NLA_POLICY_MAX(NLA_U32,
+							 TC_QOPT_MAX_QUEUE - 1),
+	[TCA_MQPRIO_TC_ENTRY_FP]	= NLA_POLICY_RANGE(NLA_U32,
+							   TC_FP_EXPRESS,
+							   TC_FP_PREEMPTIBLE),
+};
+
static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
[TCA_MQPRIO_MODE] = { .len = sizeof(u16) },
[TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) },
[TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED },
[TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED },
+ [TCA_MQPRIO_TC_ENTRY] = { .type = NLA_NESTED },
};

+static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct nlattr *opt,
+ unsigned long *seen_tcs,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1];
+ int err, tc;
+
+ err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt,
+ mqprio_tc_entry_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_MQPRIO_TC_ENTRY_INDEX)) {
+ NL_SET_ERR_MSG(extack, "TC entry index missing");
+ return -EINVAL;
+ }
+
+ tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]);
+ if (*seen_tcs & BIT(tc)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_TC_ENTRY_INDEX],
+ "Duplicate tc entry");
+ return -EINVAL;
+ }
+
+ *seen_tcs |= BIT(tc);
+
+ if (tb[TCA_MQPRIO_TC_ENTRY_FP])
+ fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]);
+
+ return 0;
+}
+
+static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt,
+ int nlattr_opt_len,
+ struct netlink_ext_ack *extack)
+{
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ bool have_preemption = false;
+ unsigned long seen_tcs = 0;
+ u32 fp[TC_QOPT_MAX_QUEUE];
+ struct nlattr *n;
+ int tc, rem;
+ int err = 0;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ fp[tc] = priv->fp[tc];
+
+ nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) {
+ if (nla_type(n) != TCA_MQPRIO_TC_ENTRY)
+ continue;
+
+ err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack);
+ if (err)
+ goto out;
+ }
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ priv->fp[tc] = fp[tc];
+ if (fp[tc] == TC_FP_PREEMPTIBLE)
+ have_preemption = true;
+ }
+
+ if (have_preemption && !ethtool_dev_mm_supported(dev)) {
+ NL_SET_ERR_MSG(extack, "Device does not support preemption");
+ return -EOPNOTSUPP;
+ }
+out:
+ return err;
+}
+
/* Parse the other netlink attributes that represent the payload of
* TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt.
*/
@@ -234,6 +320,13 @@ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
priv->flags |= TC_MQPRIO_F_MAX_RATE;
}

+ if (tb[TCA_MQPRIO_TC_ENTRY]) {
+ err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len,
+ extack);
+ if (err)
+ return err;
+ }
+
return 0;
}

@@ -247,7 +340,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
int i, err = -EOPNOTSUPP;
struct tc_mqprio_qopt *qopt = NULL;
struct tc_mqprio_caps caps;
- int len;
+ int len, tc;

BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
@@ -265,6 +358,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt || nla_len(opt) < sizeof(*qopt))
return -EINVAL;

+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ priv->fp[tc] = TC_FP_EXPRESS;
+
qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
&caps, sizeof(caps));

@@ -415,6 +511,33 @@ static int dump_rates(struct mqprio_sched *priv,
return -1;
}

+static int mqprio_dump_tc_entries(struct mqprio_sched *priv,
+ struct sk_buff *skb)
+{
+ struct nlattr *n;
+ int tc;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY);
+ if (!n)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc]))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, n);
+ }
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, n);
+ return -EMSGSIZE;
+}
+
static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct net_device *dev = qdisc_dev(sch);
@@ -465,6 +588,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
(dump_rates(priv, &opt, skb) != 0))
goto nla_put_failure;

+ if (mqprio_dump_tc_entries(priv, skb))
+ goto nla_put_failure;
+
return nla_nest_end(skb, nla);
nla_put_failure:
nlmsg_trim(skb, nla);
diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c
index c58a533b8ec5..83b3793c4012 100644
--- a/net/sched/sch_mqprio_lib.c
+++ b/net/sched/sch_mqprio_lib.c
@@ -114,4 +114,18 @@ void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt
}
EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);

+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct tc_mqprio_qopt_offload *mqprio)
+{
+ unsigned long preemptible_tcs = 0;
+ int tc;
+
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+ if (fp[tc] == TC_FP_PREEMPTIBLE)
+ preemptible_tcs |= BIT(tc);
+
+ mqprio->preemptible_tcs = preemptible_tcs;
+}
+EXPORT_SYMBOL_GPL(mqprio_fp_to_offload);
+
MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h
index 63f725ab8761..079f597072e3 100644
--- a/net/sched/sch_mqprio_lib.h
+++ b/net/sched/sch_mqprio_lib.h
@@ -14,5 +14,7 @@ int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
struct netlink_ext_ack *extack);
void mqprio_qopt_reconstruct(struct net_device *dev,
struct tc_mqprio_qopt *qopt);
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+ struct tc_mqprio_qopt_offload *mqprio);

#endif
--
2.34.1