Re: [net-next v7 07/10] net: bnxt: Implement software USO

From: Eric Dumazet

Date: Wed Apr 01 2026 - 20:39:06 EST


On Wed, Apr 1, 2026 at 4:38 PM Joe Damato <joe@xxxxxxx> wrote:
>
> Implement bnxt_sw_udp_gso_xmit() using the core tso_dma_map API and
> the pre-allocated TX inline buffer for per-segment headers.
>
> The xmit path:
> 1. Calls tso_start() to initialize TSO state
> 2. Stack-allocates a tso_dma_map and calls tso_dma_map_init() to
> DMA-map the linear payload and all frags upfront.
> 3. For each segment:
> - Copies and patches headers via tso_build_hdr() into the
> pre-allocated tx_inline_buf (DMA-synced per segment)
> - Counts payload BDs via tso_dma_map_count()
> - Emits long BD (header) + ext BD + payload BDs
> - Payload BDs use tso_dma_map_next(), which yields (dma_addr,
> chunk_len, mapping_len) tuples.
>
> Header BDs set dma_unmap_len=0 since the inline buffer is pre-allocated
> and unmapped only at ring teardown.
>
> Completion state is updated by calling tso_dma_map_completion_save() for
> the last segment.
>
> Suggested-by: Jakub Kicinski <kuba@xxxxxxxxxx>
> Signed-off-by: Joe Damato <joe@xxxxxxx>
> ---
> v7:
> - Dropped Pavan's Reviewed-by as some changes were made.
> - Updated struct bnxt_sw_tx_bd to embed a tso_dma_map_completion_state
> struct for tracking completion state.
> - Dropped an unnecessary slot check.
> - Eliminated an ugly looking ternary to simplify the code.
> - Call tso_dma_map_completion_save to update completion state.
>
> v6:
> - Addressed Paolo's feedback where the IOVA API could fail transiently,
> leaving stale state in iova_state. Fixed this by always copying the state,
> noting that dma_iova_try_alloc is called unconditionally in the
> tso_dma_map_init function (via tso_dma_iova_try), which zeroes the state
> even if the API can't be used.
> - Since this was a very minor change, I retained Pavan's Reviewed-by.
>
> v5:
> - Added __maybe_unused to last_unmap_len and last_unmap_addr to silence a
> build warning when CONFIG_NEED_DMA_MAP_STATE is disabled. No functional
> changes.
> - Added Pavan's Reviewed-by.
>
> v4:
> - Fixed the early return issue Pavan pointed out when num_segs <= 1; use the
> drop label instead of returning.
>
> v3:
> - Added iova_state and iova_total_len to struct bnxt_sw_tx_bd.
> - Stores iova_state on the last segment's tx_buf during xmit.
>
> rfcv2:
> - set the unmap len on the last descriptor, so that when completions fire
> only the last completion unmaps the region.
>
> drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 +
> drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c | 197 ++++++++++++++++++
> 2 files changed, 200 insertions(+)
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> index b5b84d1e5217..993b215413c7 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
> @@ -11,6 +11,8 @@
> #ifndef BNXT_H
> #define BNXT_H
>
> +#include <net/tso.h>
> +
> #define DRV_MODULE_NAME "bnxt_en"
>
> /* DO NOT CHANGE DRV_VER_* defines
> @@ -899,6 +901,7 @@ struct bnxt_sw_tx_bd {
> u16 rx_prod;
> u16 txts_prod;
> };
> + struct tso_dma_map_completion_state sw_gso_cstate;
> };
>
> #define BNXT_SW_GSO_MID 1
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> index b296769ee4fe..b0f8126b6903 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> @@ -19,11 +19,208 @@
> #include "bnxt.h"
> #include "bnxt_gso.h"
>
> +static u32 bnxt_sw_gso_lhint(unsigned int len)
> +{
> + if (len <= 512)
> + return TX_BD_FLAGS_LHINT_512_AND_SMALLER;
> + else if (len <= 1023)
> + return TX_BD_FLAGS_LHINT_512_TO_1023;
> + else if (len <= 2047)
> + return TX_BD_FLAGS_LHINT_1024_TO_2047;
> + else
> + return TX_BD_FLAGS_LHINT_2048_AND_LARGER;
> +}
> +
> netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp,
> struct bnxt_tx_ring_info *txr,
> struct netdev_queue *txq,
> struct sk_buff *skb)
> {
> + unsigned int last_unmap_len __maybe_unused = 0;
> + dma_addr_t last_unmap_addr __maybe_unused = 0;
> + struct bnxt_sw_tx_bd *last_unmap_buf = NULL;
> + unsigned int hdr_len, mss, num_segs;
> + struct pci_dev *pdev = bp->pdev;
> + unsigned int total_payload;
> + struct tso_dma_map map;
> + u32 vlan_tag_flags = 0;
> + int i, bds_needed;
> + struct tso_t tso;
> + u16 cfa_action;
> + u16 prod;
> +
> + hdr_len = tso_start(skb, &tso);
> + mss = skb_shinfo(skb)->gso_size;
> + total_payload = skb->len - hdr_len;
> + num_segs = DIV_ROUND_UP(total_payload, mss);
> +
> + /* Zero the csum fields so tso_build_hdr will propagate zeroes into
> + * every segment header. HW csum offload will recompute from scratch.
> + */

We might need a call to skb_cow_head(skb, 0) before changing ->check
(or anything in skb->head)

Alternative would be to perform the clears after each tso_build_hdr()
and leave skb->head untouched.


> + udp_hdr(skb)->check = 0;
> + if (!tso.ipv6)
> + ip_hdr(skb)->check = 0;
> +
> + if (unlikely(num_segs <= 1))
> + goto drop;
> +
> + /* Upper bound on the number of descriptors needed.
> + *
> + * Each segment uses 1 long BD + 1 ext BD + payload BDs, which is
> + * at most num_segs + nr_frags (each frag boundary crossing adds at
> + * most 1 extra BD).
> + */
> + bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1;
> +
> + if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) {
> + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
> + bp->tx_wake_thresh);
> + return NETDEV_TX_BUSY;
> + }
> +
> + if (unlikely(tso_dma_map_init(&map, &pdev->dev, skb, hdr_len)))
> + goto drop;
> +
> + cfa_action = bnxt_xmit_get_cfa_action(skb);
> + if (skb_vlan_tag_present(skb)) {
> + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN |
> + skb_vlan_tag_get(skb);
> + if (skb->vlan_proto == htons(ETH_P_8021Q))
> + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT;
> + }
> +
> + prod = txr->tx_prod;
> +
> + for (i = 0; i < num_segs; i++) {
> + unsigned int seg_payload = min_t(unsigned int, mss,
> + total_payload - i * mss);
> + u16 slot = (txr->tx_inline_prod + i) &
> + (BNXT_SW_USO_MAX_SEGS - 1);
> + struct bnxt_sw_tx_bd *tx_buf;
> + unsigned int mapping_len;
> + dma_addr_t this_hdr_dma;
> + unsigned int chunk_len;
> + unsigned int offset;
> + dma_addr_t dma_addr;
> + struct tx_bd *txbd;
> + void *this_hdr;
> + int bd_count;
> + __le32 csum;
> + bool last;
> + u32 flags;
> +
> + last = (i == num_segs - 1);
> + offset = slot * TSO_HEADER_SIZE;
> + this_hdr = txr->tx_inline_buf + offset;
> + this_hdr_dma = txr->tx_inline_dma + offset;
> +
> + tso_build_hdr(skb, this_hdr, &tso, seg_payload, last);
> +
> + dma_sync_single_for_device(&pdev->dev, this_hdr_dma,
> + hdr_len, DMA_TO_DEVICE);
> +
> + bd_count = tso_dma_map_count(&map, seg_payload);
> +
> + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)];
> + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)];
> +
> + tx_buf->skb = skb;
> + tx_buf->nr_frags = bd_count;
> + tx_buf->is_push = 0;
> + tx_buf->is_ts_pkt = 0;
> +
> + dma_unmap_addr_set(tx_buf, mapping, this_hdr_dma);
> + dma_unmap_len_set(tx_buf, len, 0);
> +
> + if (last) {
> + tx_buf->is_sw_gso = BNXT_SW_GSO_LAST;
> + tso_dma_map_completion_save(&map, &tx_buf->sw_gso_cstate);
> + } else {
> + tx_buf->is_sw_gso = BNXT_SW_GSO_MID;
> + }
> +
> + flags = (hdr_len << TX_BD_LEN_SHIFT) |
> + TX_BD_TYPE_LONG_TX_BD |
> + TX_BD_CNT(2 + bd_count);
> +
> + flags |= bnxt_sw_gso_lhint(hdr_len + seg_payload);
> +
> + txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
> + txbd->tx_bd_haddr = cpu_to_le64(this_hdr_dma);
> + txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod,
> + 2 + bd_count);
> +
> + csum = cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM |
> + TX_BD_FLAGS_IP_CKSUM);
> +
> + prod = NEXT_TX(prod);
> + bnxt_init_ext_bd(bp, txr, prod, csum,
> + vlan_tag_flags, cfa_action);
> +
> + /* set dma_unmap_len on the LAST BD touching each
> + * region. Since completions are in-order, the last segment
> + * completes after all earlier ones, so the unmap is safe.
> + */
> + while (tso_dma_map_next(&map, &dma_addr, &chunk_len,
> + &mapping_len, seg_payload)) {
> + prod = NEXT_TX(prod);
> + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)];
> + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)];
> +
> + txbd->tx_bd_haddr = cpu_to_le64(dma_addr);
> + dma_unmap_addr_set(tx_buf, mapping, dma_addr);
> + dma_unmap_len_set(tx_buf, len, 0);
> + tx_buf->skb = NULL;
> + tx_buf->is_sw_gso = 0;
> +
> + if (mapping_len) {
> + if (last_unmap_buf) {
> + dma_unmap_addr_set(last_unmap_buf,
> + mapping,
> + last_unmap_addr);
> + dma_unmap_len_set(last_unmap_buf,
> + len,
> + last_unmap_len);
> + }
> + last_unmap_addr = dma_addr;
> + last_unmap_len = mapping_len;
> + }
> + last_unmap_buf = tx_buf;
> +
> + flags = chunk_len << TX_BD_LEN_SHIFT;
> + txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
> + txbd->tx_bd_opaque = 0;
> +
> + seg_payload -= chunk_len;
> + }
> +
> + txbd->tx_bd_len_flags_type |=
> + cpu_to_le32(TX_BD_FLAGS_PACKET_END);
> +
> + prod = NEXT_TX(prod);
> + }
> +
> + if (last_unmap_buf) {
> + dma_unmap_addr_set(last_unmap_buf, mapping, last_unmap_addr);
> + dma_unmap_len_set(last_unmap_buf, len, last_unmap_len);
> + }
> +
> + txr->tx_inline_prod += num_segs;
> +
> + netdev_tx_sent_queue(txq, skb->len);
> +
> + WRITE_ONCE(txr->tx_prod, prod);
> + /* Sync BDs before doorbell */
> + wmb();
> + bnxt_db_write(bp, &txr->tx_db, prod);
> +
> + if (unlikely(bnxt_tx_avail(bp, txr) <= bp->tx_wake_thresh))
> + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
> + bp->tx_wake_thresh);
> +
> + return NETDEV_TX_OK;
> +
> +drop:
> dev_kfree_skb_any(skb);
> dev_core_stats_tx_dropped_inc(bp->dev);
> return NETDEV_TX_OK;
> --
> 2.52.0
>