Re: [PATCH v4 net-next 6/9] net/sched: mqprio: allow per-TC user input of FP adminStatus

From: Jamal Hadi Salim
Date: Fri Apr 07 2023 - 12:22:47 EST


On Mon, Apr 3, 2023 at 6:35 AM Vladimir Oltean <vladimir.oltean@xxxxxxx> wrote:
>
> IEEE 802.1Q-2018 clause 6.7.2 Frame preemption specifies that each
> packet priority can be assigned to a "frame preemption status" value of
> either "express" or "preemptible". Express priorities are transmitted by
> the local device through the eMAC, and preemptible priorities through
> the pMAC (the concepts of eMAC and pMAC come from the 802.3 MAC Merge
> layer).
>
> The FP adminStatus is defined per packet priority, but 802.1Q clause
> 12.30.1.1.1 framePreemptionAdminStatus also says that:
>
> | Priorities that all map to the same traffic class should be
> | constrained to use the same value of preemption status.
>
> It is impossible to ignore the cognitive dissonance in the standard
> here, because it practically means that the FP adminStatus only takes
> distinct values per traffic class, even though it is defined per
> priority.
>
> I can see no valid use case which is prevented by having the kernel take
> the FP adminStatus as input per traffic class (what we do here).
> In addition, this also enforces the above constraint by construction.
> User space network managers which wish to expose FP adminStatus per
> priority are free to do so; they must only observe the prio_tc_map of
> the netdev (which presumably is also under their control, when
> constructing the mqprio netlink attributes).
>
> The reason for configuring frame preemption as a property of the Qdisc
> layer is that the information about "preemptible TCs" is closest to the
> place which handles the num_tc and prio_tc_map of the netdev. If the
> UAPI would have been any other layer, it would be unclear what to do
> with the FP information when num_tc collapses to 0. A key assumption is
> that only mqprio/taprio change the num_tc and prio_tc_map of the netdev.
> Not sure if that's a great assumption to make.
>
> Having FP in tc-mqprio can be seen as an implementation of the use case
> defined in 802.1Q Annex S.2 "Preemption used in isolation". There will
> be a separate implementation of FP in tc-taprio, for the other use
> cases.
>
> Signed-off-by: Vladimir Oltean <vladimir.oltean@xxxxxxx>
> Reviewed-by: Ferenc Fejes <fejes@xxxxxxxxxxx>
> Reviewed-by: Simon Horman <simon.horman@xxxxxxxxxxxx>
> ---
> v3->v4: none
> v2->v3: none
> v1->v2:
> - slightly reword commit message
> - move #include <linux/ethtool_netlink.h> to this patch
> - remove self-evident comment "only for dump and offloading"
>
> include/net/pkt_sched.h | 1 +
> include/uapi/linux/pkt_sched.h | 16 +++++
> net/sched/sch_mqprio.c | 127 ++++++++++++++++++++++++++++++++-
> net/sched/sch_mqprio_lib.c | 14 ++++
> net/sched/sch_mqprio_lib.h | 2 +
> 5 files changed, 159 insertions(+), 1 deletion(-)
>
> diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
> index b43ed4733455..f436688b6efc 100644
> --- a/include/net/pkt_sched.h
> +++ b/include/net/pkt_sched.h
> @@ -172,6 +172,7 @@ struct tc_mqprio_qopt_offload {
> u32 flags;
> u64 min_rate[TC_QOPT_MAX_QUEUE];
> u64 max_rate[TC_QOPT_MAX_QUEUE];
> + unsigned long preemptible_tcs;
> };
>
> struct tc_taprio_caps {
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 000eec106856..b8d29be91b62 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -719,6 +719,11 @@ enum {
>
> #define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)
>
> +enum {
> + TC_FP_EXPRESS = 1,
> + TC_FP_PREEMPTIBLE = 2,
> +};

Suggestion: Add a MAX to this enum (as is traditionally done)..

> +
> struct tc_mqprio_qopt {
> __u8 num_tc;
> __u8 prio_tc_map[TC_QOPT_BITMASK + 1];
> @@ -732,12 +737,23 @@ struct tc_mqprio_qopt {
> #define TC_MQPRIO_F_MIN_RATE 0x4
> #define TC_MQPRIO_F_MAX_RATE 0x8
>
> +enum {
> + TCA_MQPRIO_TC_ENTRY_UNSPEC,
> + TCA_MQPRIO_TC_ENTRY_INDEX, /* u32 */
> + TCA_MQPRIO_TC_ENTRY_FP, /* u32 */
> +
> + /* add new constants above here */
> + __TCA_MQPRIO_TC_ENTRY_CNT,
> + TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1)
> +};
> +
> enum {
> TCA_MQPRIO_UNSPEC,
> TCA_MQPRIO_MODE,
> TCA_MQPRIO_SHAPER,
> TCA_MQPRIO_MIN_RATE64,
> TCA_MQPRIO_MAX_RATE64,
> + TCA_MQPRIO_TC_ENTRY,
> __TCA_MQPRIO_MAX,
> };
>
> diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
> index 5287ff60b3f9..bc158a7fd6ba 100644
> --- a/net/sched/sch_mqprio.c
> +++ b/net/sched/sch_mqprio.c
> @@ -5,6 +5,7 @@
> * Copyright (c) 2010 John Fastabend <john.r.fastabend@xxxxxxxxx>
> */
>
> +#include <linux/ethtool_netlink.h>
> #include <linux/types.h>
> #include <linux/slab.h>
> #include <linux/kernel.h>
> @@ -27,6 +28,7 @@ struct mqprio_sched {
> u32 flags;
> u64 min_rate[TC_QOPT_MAX_QUEUE];
> u64 max_rate[TC_QOPT_MAX_QUEUE];
> + u32 fp[TC_QOPT_MAX_QUEUE];
> };
>
> static int mqprio_enable_offload(struct Qdisc *sch,
> @@ -63,6 +65,8 @@ static int mqprio_enable_offload(struct Qdisc *sch,
> return -EINVAL;
> }
>
> + mqprio_fp_to_offload(priv->fp, &mqprio);
> +
> err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
> &mqprio);
> if (err)
> @@ -145,13 +149,94 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
> return 0;
> }
>
> +static const struct
> +nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = {
> + [TCA_MQPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32,
> + TC_QOPT_MAX_QUEUE),

And use it here...

Out of curiosity, could you have more that 16 queues in this spec? I
noticed 802.1p mentioned somewhere (typically 3 bits).
Lead up question: if the max is 16 then can preemptible_tcs for example be u32?

cheers,
jamal

> + [TCA_MQPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32,
> + TC_FP_EXPRESS,
> + TC_FP_PREEMPTIBLE),
> +};
> +
> static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
> [TCA_MQPRIO_MODE] = { .len = sizeof(u16) },
> [TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) },
> [TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED },
> [TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED },
> + [TCA_MQPRIO_TC_ENTRY] = { .type = NLA_NESTED },
> };
>
> +static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE],
> + struct nlattr *opt,
> + unsigned long *seen_tcs,
> + struct netlink_ext_ack *extack)
> +{
> + struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1] = { };
> + int err, tc;
> +
> + err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt,
> + mqprio_tc_entry_policy, extack);
> + if (err < 0)
> + return err;
> +
> + if (!tb[TCA_MQPRIO_TC_ENTRY_INDEX]) {
> + NL_SET_ERR_MSG(extack, "TC entry index missing");
> + return -EINVAL;
> + }
> +
> + tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]);
> + if (*seen_tcs & BIT(tc)) {
> + NL_SET_ERR_MSG(extack, "Duplicate tc entry");
> + return -EINVAL;
> + }
> +
> + *seen_tcs |= BIT(tc);
> +
> + if (tb[TCA_MQPRIO_TC_ENTRY_FP])
> + fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]);
> +
> + return 0;
> +}
> +
> +static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt,
> + int nlattr_opt_len,
> + struct netlink_ext_ack *extack)
> +{
> + struct mqprio_sched *priv = qdisc_priv(sch);
> + struct net_device *dev = qdisc_dev(sch);
> + bool have_preemption = false;
> + unsigned long seen_tcs = 0;
> + u32 fp[TC_QOPT_MAX_QUEUE];
> + struct nlattr *n;
> + int tc, rem;
> + int err = 0;
> +
> + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
> + fp[tc] = priv->fp[tc];
> +
> + nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) {
> + if (nla_type(n) != TCA_MQPRIO_TC_ENTRY)
> + continue;
> +
> + err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack);
> + if (err)
> + goto out;
> + }
> +
> + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
> + priv->fp[tc] = fp[tc];
> + if (fp[tc] == TC_FP_PREEMPTIBLE)
> + have_preemption = true;
> + }
> +
> + if (have_preemption && !ethtool_dev_mm_supported(dev)) {
> + NL_SET_ERR_MSG(extack, "Device does not support preemption");
> + return -EOPNOTSUPP;
> + }
> +out:
> + return err;
> +}
> +
> /* Parse the other netlink attributes that represent the payload of
> * TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt.
> */
> @@ -234,6 +319,13 @@ static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
> priv->flags |= TC_MQPRIO_F_MAX_RATE;
> }
>
> + if (tb[TCA_MQPRIO_TC_ENTRY]) {
> + err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len,
> + extack);
> + if (err)
> + return err;
> + }
> +
> return 0;
> }
>
> @@ -247,7 +339,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
> int i, err = -EOPNOTSUPP;
> struct tc_mqprio_qopt *qopt = NULL;
> struct tc_mqprio_caps caps;
> - int len;
> + int len, tc;
>
> BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
> BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
> @@ -265,6 +357,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
> if (!opt || nla_len(opt) < sizeof(*qopt))
> return -EINVAL;
>
> + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
> + priv->fp[tc] = TC_FP_EXPRESS;
> +
> qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
> &caps, sizeof(caps));
>
> @@ -415,6 +510,33 @@ static int dump_rates(struct mqprio_sched *priv,
> return -1;
> }
>
> +static int mqprio_dump_tc_entries(struct mqprio_sched *priv,
> + struct sk_buff *skb)
> +{
> + struct nlattr *n;
> + int tc;
> +
> + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
> + n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY);
> + if (!n)
> + return -EMSGSIZE;
> +
> + if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc))
> + goto nla_put_failure;
> +
> + if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc]))
> + goto nla_put_failure;
> +
> + nla_nest_end(skb, n);
> + }
> +
> + return 0;
> +
> +nla_put_failure:
> + nla_nest_cancel(skb, n);
> + return -EMSGSIZE;
> +}
> +
> static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
> {
> struct net_device *dev = qdisc_dev(sch);
> @@ -465,6 +587,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
> (dump_rates(priv, &opt, skb) != 0))
> goto nla_put_failure;
>
> + if (mqprio_dump_tc_entries(priv, skb))
> + goto nla_put_failure;
> +
> return nla_nest_end(skb, nla);
> nla_put_failure:
> nlmsg_trim(skb, nla);
> diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c
> index c58a533b8ec5..83b3793c4012 100644
> --- a/net/sched/sch_mqprio_lib.c
> +++ b/net/sched/sch_mqprio_lib.c
> @@ -114,4 +114,18 @@ void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt
> }
> EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);
>
> +void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
> + struct tc_mqprio_qopt_offload *mqprio)
> +{
> + unsigned long preemptible_tcs = 0;
> + int tc;
> +
> + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
> + if (fp[tc] == TC_FP_PREEMPTIBLE)
> + preemptible_tcs |= BIT(tc);
> +
> + mqprio->preemptible_tcs = preemptible_tcs;
> +}
> +EXPORT_SYMBOL_GPL(mqprio_fp_to_offload);
> +
> MODULE_LICENSE("GPL");
> diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h
> index 63f725ab8761..079f597072e3 100644
> --- a/net/sched/sch_mqprio_lib.h
> +++ b/net/sched/sch_mqprio_lib.h
> @@ -14,5 +14,7 @@ int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
> struct netlink_ext_ack *extack);
> void mqprio_qopt_reconstruct(struct net_device *dev,
> struct tc_mqprio_qopt *qopt);
> +void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
> + struct tc_mqprio_qopt_offload *mqprio);
>
> #endif
> --
> 2.34.1
>