Re: [PATCH v8 net-next 3/4] net: enetc: add LSO support for i.MX95 ENETC PF
From: Alexander Lobakin
Date: Tue Dec 17 2024 - 10:33:52 EST
From: Wei Fang <wei.fang@xxxxxxx>
Date: Fri, 13 Dec 2024 10:17:30 +0800
> ENETC rev 4.1 supports large send offload (LSO), segmenting large TCP
> and UDP transmit units into multiple Ethernet frames. To support LSO,
> software needs to fill some auxiliary information in Tx BD, such as LSO
> header length, frame length, LSO maximum segment size, etc.
>
> At 1Gbps link rate, TCP segmentation was tested using iperf3, and the
> CPU performance before and after applying the patch was compared through
> the top command. It can be seen that LSO saves a significant amount of
> CPU cycles compared to software TSO.
>
> Before applying the patch:
> %Cpu(s): 0.1 us, 4.1 sy, 0.0 ni, 85.7 id, 0.0 wa, 0.5 hi, 9.7 si
>
> After applying the patch:
> %Cpu(s): 0.1 us, 2.3 sy, 0.0 ni, 94.5 id, 0.0 wa, 0.4 hi, 2.6 si
>
> Signed-off-by: Wei Fang <wei.fang@xxxxxxx>
> Reviewed-by: Frank Li <Frank.Li@xxxxxxx>
> Reviewed-by: Claudiu Manoil <claudiu.manoil@xxxxxxx>
> ---
> v2: no changes
> v3: use enetc_skb_is_ipv6() helper fucntion which is added in patch 2
> v4: fix a typo
> v5: no changes
> v6: remove error logs from the datapath
> v7: rebase the patch due to the layout change of enetc_tx_bd
> v8: rebase the patch due to merge conflicts
> ---
> drivers/net/ethernet/freescale/enetc/enetc.c | 257 +++++++++++++++++-
> drivers/net/ethernet/freescale/enetc/enetc.h | 15 +
> .../net/ethernet/freescale/enetc/enetc4_hw.h | 22 ++
> .../net/ethernet/freescale/enetc/enetc_hw.h | 14 +-
> .../freescale/enetc/enetc_pf_common.c | 3 +
> 5 files changed, 301 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c
> index 09ca4223ff9d..41a3798c7564 100644
> --- a/drivers/net/ethernet/freescale/enetc/enetc.c
> +++ b/drivers/net/ethernet/freescale/enetc/enetc.c
> @@ -533,6 +533,224 @@ static void enetc_tso_complete_csum(struct enetc_bdr *tx_ring, struct tso_t *tso
> }
> }
>
> +static inline int enetc_lso_count_descs(const struct sk_buff *skb)
> +{
> + /* 4 BDs: 1 BD for LSO header + 1 BD for extended BD + 1 BD
> + * for linear area data but not include LSO header, namely
> + * skb_headlen(skb) - lso_hdr_len. And 1 BD for gap.
What if the head contains headers only and
`skb_headlen(skb) - lso_hdr_len` is 0?
> + */
> + return skb_shinfo(skb)->nr_frags + 4;
> +}
> +
> +static int enetc_lso_get_hdr_len(const struct sk_buff *skb)
> +{
> + int hdr_len, tlen;
> +
> + tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr);
> + hdr_len = skb_transport_offset(skb) + tlen;
> +
> + return hdr_len;
> +}
Are you sure the kernel doesn't have similar generic helpers?
> +
> +static void enetc_lso_start(struct sk_buff *skb, struct enetc_lso_t *lso)
> +{
> + lso->lso_seg_size = skb_shinfo(skb)->gso_size;
> + lso->ipv6 = enetc_skb_is_ipv6(skb);
> + lso->tcp = skb_is_gso_tcp(skb);
> + lso->l3_hdr_len = skb_network_header_len(skb);
> + lso->l3_start = skb_network_offset(skb);
> + lso->hdr_len = enetc_lso_get_hdr_len(skb);
> + lso->total_len = skb->len - lso->hdr_len;
> +}
> +
> +static void enetc_lso_map_hdr(struct enetc_bdr *tx_ring, struct sk_buff *skb,
> + int *i, struct enetc_lso_t *lso)
> +{
> + union enetc_tx_bd txbd_tmp, *txbd;
> + struct enetc_tx_swbd *tx_swbd;
> + u16 frm_len, frm_len_ext;
> + u8 flags, e_flags = 0;
> + dma_addr_t addr;
> + char *hdr;
> +
> + /* Get the first BD of the LSO BDs chain */
> + txbd = ENETC_TXBD(*tx_ring, *i);
> + tx_swbd = &tx_ring->tx_swbd[*i];
> + prefetchw(txbd);
Is this prefetchw() proven to give any benefit?
> +
> + /* Prepare LSO header: MAC + IP + TCP/UDP */
> + hdr = tx_ring->tso_headers + *i * TSO_HEADER_SIZE;
> + memcpy(hdr, skb->data, lso->hdr_len);
> + addr = tx_ring->tso_headers_dma + *i * TSO_HEADER_SIZE;
> +
> + frm_len = lso->total_len & 0xffff;
> + frm_len_ext = (lso->total_len >> 16) & 0xf;
Why are these magics just open-coded, even without any comment?
I have no idea what is going on here for example.
Also, `& 0xffff` is lower_16_bits(), while `lso->total_len >> 16` is
upper_16_bits().
> +
> + /* Set the flags of the first BD */
> + flags = ENETC_TXBD_FLAGS_EX | ENETC_TXBD_FLAGS_CSUM_LSO |
> + ENETC_TXBD_FLAGS_LSO | ENETC_TXBD_FLAGS_L4CS;
> +
> + enetc_clear_tx_bd(&txbd_tmp);
> + txbd_tmp.addr = cpu_to_le64(addr);
> + txbd_tmp.hdr_len = cpu_to_le16(lso->hdr_len);
> +
> + /* first BD needs frm_len and offload flags set */
> + txbd_tmp.frm_len = cpu_to_le16(frm_len);
> + txbd_tmp.flags = flags;
> +
> + txbd_tmp.l3_aux0 = FIELD_PREP(ENETC_TX_BD_L3_START, lso->l3_start);
> + /* l3_hdr_size in 32-bits (4 bytes) */
> + txbd_tmp.l3_aux1 = FIELD_PREP(ENETC_TX_BD_L3_HDR_LEN,
> + lso->l3_hdr_len / 4);
> + if (lso->ipv6)
> + txbd_tmp.l3_aux1 |= FIELD_PREP(ENETC_TX_BD_L3T, 1);
> + else
> + txbd_tmp.l3_aux0 |= FIELD_PREP(ENETC_TX_BD_IPCS, 1);
Both these "fields" are single bits. You don't need FIELD_PREP() for
single-bit fields, just `|= ENETC_TX_BD_L3T` etc.
> +
> + txbd_tmp.l4_aux = FIELD_PREP(ENETC_TX_BD_L4T, lso->tcp ?
> + ENETC_TXBD_L4T_TCP : ENETC_TXBD_L4T_UDP);
> +
> + /* For the LSO header we do not set the dma address since
> + * we do not want it unmapped when we do cleanup. We still
> + * set len so that we count the bytes sent.
> + */
> + tx_swbd->len = lso->hdr_len;
> + tx_swbd->do_twostep_tstamp = false;
> + tx_swbd->check_wb = false;
> +
> + /* Actually write the header in the BD */
> + *txbd = txbd_tmp;
> +
> + /* Get the next BD, and the next BD is extended BD */
> + enetc_bdr_idx_inc(tx_ring, i);
> + txbd = ENETC_TXBD(*tx_ring, *i);
> + tx_swbd = &tx_ring->tx_swbd[*i];
> + prefetchw(txbd);
(same question as for the previous prefetchw())
> +
> + enetc_clear_tx_bd(&txbd_tmp);
> + if (skb_vlan_tag_present(skb)) {
> + /* Setup the VLAN fields */
> + txbd_tmp.ext.vid = cpu_to_le16(skb_vlan_tag_get(skb));
> + txbd_tmp.ext.tpid = 0; /* < C-TAG */
???
Maybe #define it somewhere, that 0 means CVLAN etc.?
> + e_flags = ENETC_TXBD_E_FLAGS_VLAN_INS;
> + }
> +
> + /* Write the BD */
> + txbd_tmp.ext.e_flags = e_flags;
> + txbd_tmp.ext.lso_sg_size = cpu_to_le16(lso->lso_seg_size);
> + txbd_tmp.ext.frm_len_ext = cpu_to_le16(frm_len_ext);
> + *txbd = txbd_tmp;
> +}
> +
> +static int enetc_lso_map_data(struct enetc_bdr *tx_ring, struct sk_buff *skb,
> + int *i, struct enetc_lso_t *lso, int *count)
> +{
> + union enetc_tx_bd txbd_tmp, *txbd = NULL;
> + struct enetc_tx_swbd *tx_swbd;
> + skb_frag_t *frag;
> + dma_addr_t dma;
> + u8 flags = 0;
> + int len, f;
> +
> + len = skb_headlen(skb) - lso->hdr_len;
> + if (len > 0) {
> + dma = dma_map_single(tx_ring->dev, skb->data + lso->hdr_len,
> + len, DMA_TO_DEVICE);
> + if (unlikely(dma_mapping_error(tx_ring->dev, dma)))
dma_mapping_error() already contains unlikely().
> + return -ENOMEM;
> +
> + enetc_bdr_idx_inc(tx_ring, i);
> + txbd = ENETC_TXBD(*tx_ring, *i);
> + tx_swbd = &tx_ring->tx_swbd[*i];
> + prefetchw(txbd);
> + *count += 1;
> +
> + enetc_clear_tx_bd(&txbd_tmp);
> + txbd_tmp.addr = cpu_to_le64(dma);
> + txbd_tmp.buf_len = cpu_to_le16(len);
> +
> + tx_swbd->dma = dma;
> + tx_swbd->len = len;
> + tx_swbd->is_dma_page = 0;
> + tx_swbd->dir = DMA_TO_DEVICE;
> + }
> +
> + frag = &skb_shinfo(skb)->frags[0];
> + for (f = 0; f < skb_shinfo(skb)->nr_frags; f++, frag++) {
> + if (txbd)
> + *txbd = txbd_tmp;
> +
> + len = skb_frag_size(frag);
> + dma = skb_frag_dma_map(tx_ring->dev, frag, 0, len,
> + DMA_TO_DEVICE);
You now can use skb_frag_dma_map() with 2-4 arguments, so this can be
replaced to
dma = skb_frag_dma_map(tx_ring->dev, frag);
> + if (unlikely(dma_mapping_error(tx_ring->dev, dma)))
> + return -ENOMEM;
> +
> + /* Get the next BD */
> + enetc_bdr_idx_inc(tx_ring, i);
> + txbd = ENETC_TXBD(*tx_ring, *i);
> + tx_swbd = &tx_ring->tx_swbd[*i];
> + prefetchw(txbd);
> + *count += 1;
> +
> + enetc_clear_tx_bd(&txbd_tmp);
> + txbd_tmp.addr = cpu_to_le64(dma);
> + txbd_tmp.buf_len = cpu_to_le16(len);
> +
> + tx_swbd->dma = dma;
> + tx_swbd->len = len;
> + tx_swbd->is_dma_page = 1;
> + tx_swbd->dir = DMA_TO_DEVICE;
> + }
> +
> + /* Last BD needs 'F' bit set */
> + flags |= ENETC_TXBD_FLAGS_F;
> + txbd_tmp.flags = flags;
> + *txbd = txbd_tmp;
> +
> + tx_swbd->is_eof = 1;
> + tx_swbd->skb = skb;
> +
> + return 0;
> +}
[...]
> @@ -2096,6 +2329,13 @@ static int enetc_setup_default_rss_table(struct enetc_si *si, int num_groups)
> return 0;
> }
>
> +static void enetc_set_lso_flags_mask(struct enetc_hw *hw)
> +{
> + enetc_wr(hw, ENETC4_SILSOSFMR0,
> + SILSOSFMR0_VAL_SET(TCP_NL_SEG_FLAGS_DMASK, TCP_NL_SEG_FLAGS_DMASK));
> + enetc_wr(hw, ENETC4_SILSOSFMR1, 0);
> +}
> +
> int enetc_configure_si(struct enetc_ndev_priv *priv)
> {
> struct enetc_si *si = priv->si;
> @@ -2109,6 +2349,9 @@ int enetc_configure_si(struct enetc_ndev_priv *priv)
> /* enable SI */
> enetc_wr(hw, ENETC_SIMR, ENETC_SIMR_EN);
>
> + if (si->hw_features & ENETC_SI_F_LSO)
> + enetc_set_lso_flags_mask(hw);
> +
> /* TODO: RSS support for i.MX95 will be supported later, and the
> * is_enetc_rev1() condition will be removed
> */
> diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h
> index 1e680f0f5123..6db6b3eee45c 100644
> --- a/drivers/net/ethernet/freescale/enetc/enetc.h
> +++ b/drivers/net/ethernet/freescale/enetc/enetc.h
> @@ -41,6 +41,19 @@ struct enetc_tx_swbd {
> u8 qbv_en:1;
> };
>
> +struct enetc_lso_t {
> + bool ipv6;
> + bool tcp;
> + u8 l3_hdr_len;
> + u8 hdr_len; /* LSO header length */
> + u8 l3_start;
> + u16 lso_seg_size;
> + int total_len; /* total data length, not include LSO header */
> +};
> +
> +#define ENETC_1KB_SIZE 1024
SZ_1K
> +#define ENETC_LSO_MAX_DATA_LEN (256 * ENETC_1KB_SIZE)
SZ_256K
> +
> #define ENETC_RX_MAXFRM_SIZE ENETC_MAC_MAXFRM_SIZE
> #define ENETC_RXB_TRUESIZE 2048 /* PAGE_SIZE >> 1 */
> #define ENETC_RXB_PAD NET_SKB_PAD /* add extra space if needed */
> @@ -238,6 +251,7 @@ enum enetc_errata {
> #define ENETC_SI_F_PSFP BIT(0)
> #define ENETC_SI_F_QBV BIT(1)
> #define ENETC_SI_F_QBU BIT(2)
> +#define ENETC_SI_F_LSO BIT(3)
>
> struct enetc_drvdata {
> u32 pmac_offset; /* Only valid for PSI which supports 802.1Qbu */
> @@ -351,6 +365,7 @@ enum enetc_active_offloads {
> ENETC_F_QCI = BIT(10),
> ENETC_F_QBU = BIT(11),
> ENETC_F_TXCSUM = BIT(12),
> + ENETC_F_LSO = BIT(13),
> };
>
> enum enetc_flags_bit {
> diff --git a/drivers/net/ethernet/freescale/enetc/enetc4_hw.h b/drivers/net/ethernet/freescale/enetc/enetc4_hw.h
> index 26b220677448..cdde8e93a73c 100644
> --- a/drivers/net/ethernet/freescale/enetc/enetc4_hw.h
> +++ b/drivers/net/ethernet/freescale/enetc/enetc4_hw.h
> @@ -12,6 +12,28 @@
> #define NXP_ENETC_VENDOR_ID 0x1131
> #define NXP_ENETC_PF_DEV_ID 0xe101
>
> +/**********************Station interface registers************************/
> +/* Station interface LSO segmentation flag mask register 0/1 */
> +#define ENETC4_SILSOSFMR0 0x1300
> +#define SILSOSFMR0_TCP_MID_SEG GENMASK(27, 16)
> +#define SILSOSFMR0_TCP_1ST_SEG GENMASK(11, 0)
> +#define SILSOSFMR0_VAL_SET(first, mid) ((((mid) << 16) & SILSOSFMR0_TCP_MID_SEG) | \
Why not FIELD_PREP()?
> + ((first) & SILSOSFMR0_TCP_1ST_SEG))
> +
> +#define ENETC4_SILSOSFMR1 0x1304
> +#define SILSOSFMR1_TCP_LAST_SEG GENMASK(11, 0)
> +#define TCP_FLAGS_FIN BIT(0)
> +#define TCP_FLAGS_SYN BIT(1)
> +#define TCP_FLAGS_RST BIT(2)
> +#define TCP_FLAGS_PSH BIT(3)
> +#define TCP_FLAGS_ACK BIT(4)
> +#define TCP_FLAGS_URG BIT(5)
> +#define TCP_FLAGS_ECE BIT(6)
> +#define TCP_FLAGS_CWR BIT(7)
> +#define TCP_FLAGS_NS BIT(8)
Why are you open-coding these if they're present in uapi/linux/tcp.h?
> +/* According to tso_build_hdr(), clear all special flags for not last packet. */
But this mask is used only to do a writel(), I don't see it anywhere
clearing anything...
> +#define TCP_NL_SEG_FLAGS_DMASK (TCP_FLAGS_FIN | TCP_FLAGS_RST | TCP_FLAGS_PSH)
> +
> /***************************ENETC port registers**************************/
> #define ENETC4_ECAPR0 0x0
> #define ECAPR0_RFS BIT(2)
Thanks,
Olek