[RFC PATCH 6/6] net: ethernet: ti: am65-cpsw-nuss: Enable batch processing for TX / TX CMPL

From: Siddharth Vadapalli

Date: Wed Mar 25 2026 - 09:10:38 EST


Enable batch processing on the transmit and transmit-completion paths:
on transmit, submit packet descriptors to the hardware in batches rather
than one at a time, and on transmit completion, likewise dequeue the
completed packet descriptors in batches.

Signed-off-by: Siddharth Vadapalli <s-vadapalli@xxxxxx>
---
drivers/net/ethernet/ti/am65-cpsw-nuss.c | 201 +++++++++++++++++++----
drivers/net/ethernet/ti/am65-cpsw-nuss.h | 12 ++
2 files changed, 178 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index fc165579a479..2b354af14cb7 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -1624,14 +1624,14 @@ static inline void am65_cpsw_nuss_xmit_recycle(struct am65_cpsw_tx_chn *tx_chn,
am65_cpsw_nuss_put_tx_desc(tx_chn, first_desc);
}

-static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
- int chn, unsigned int budget, bool *tdown)
+static int am65_cpsw_nuss_tx_cmpl_free_batch(struct am65_cpsw_common *common, int chn,
+ u32 batch_size, unsigned int budget,
+ bool *tdown)
{
bool single_port = AM65_CPSW_IS_CPSW2G(common);
enum am65_cpsw_tx_buf_type buf_type;
struct am65_cpsw_tx_swdata *swdata;
struct cppi5_host_desc_t *desc_tx;
- struct device *dev = common->dev;
struct am65_cpsw_tx_chn *tx_chn;
struct netdev_queue *netif_txq;
unsigned int total_bytes = 0;
@@ -1640,21 +1640,13 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
unsigned int pkt_len;
struct sk_buff *skb;
dma_addr_t desc_dma;
- int res, num_tx = 0;
+ int num_tx = 0, i;

tx_chn = &common->tx_chns[chn];

- while (true) {
- if (!single_port)
- spin_lock(&tx_chn->lock);
- res = k3_udma_glue_pop_tx_chn(tx_chn->tx_chn, &desc_dma);
- if (!single_port)
- spin_unlock(&tx_chn->lock);
-
- if (res == -ENODATA)
- break;
-
- if (cppi5_desc_is_tdcm(desc_dma)) {
+ for (i = 0; i < batch_size; i++) {
+ desc_dma = tx_chn->cmpl_desc_dma_array[i];
+ if (unlikely(cppi5_desc_is_tdcm(desc_dma))) {
if (atomic_dec_and_test(&common->tdown_cnt))
complete(&common->tdown_complete);
*tdown = true;
@@ -1701,7 +1693,34 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq);
}

- dev_dbg(dev, "%s:%u pkt:%d\n", __func__, chn, num_tx);
+ return num_tx;
+}
+
+static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
+ int chn, unsigned int budget, bool *tdown)
+{
+ bool single_port = AM65_CPSW_IS_CPSW2G(common);
+ struct am65_cpsw_tx_chn *tx_chn;
+ u32 batch_size = 0;
+ int res, num_tx;
+
+ tx_chn = &common->tx_chns[chn];
+
+ if (!single_port)
+ spin_lock(&tx_chn->lock);
+
+ res = k3_udma_glue_pop_tx_chn_batch(tx_chn->tx_chn, tx_chn->cmpl_desc_dma_array,
+ &batch_size, AM65_CPSW_TX_BATCH_SIZE);
+ if (!batch_size) {
+ if (!single_port)
+ spin_unlock(&tx_chn->lock);
+ return 0;
+ }
+
+ num_tx = am65_cpsw_nuss_tx_cmpl_free_batch(common, chn, batch_size, budget, tdown);
+
+ if (!single_port)
+ spin_unlock(&tx_chn->lock);

return num_tx;
}
@@ -1760,18 +1779,48 @@ static irqreturn_t am65_cpsw_nuss_tx_irq(int irq, void *dev_id)
return IRQ_HANDLED;
}

+static void am65_cpsw_nuss_submit_ndev_batch(struct am65_cpsw_common *common)
+{
+ bool single_port = AM65_CPSW_IS_CPSW2G(common);
+ struct am65_cpsw_tx_desc_batch *tx_desc_batch;
+ struct am65_cpsw_tx_chn *tx_chn;
+ int ret, i;
+
+ /* Submit the pending packets for all netdevs, across all TX channels */
+ for (i = 0; i < AM65_CPSW_MAX_QUEUES; i++) {
+ if (common->tx_desc_batch[i].tx_batch_idx) {
+ tx_chn = &common->tx_chns[i];
+ tx_desc_batch = &common->tx_desc_batch[i];
+ if (!single_port)
+ spin_lock_bh(&tx_chn->lock);
+ ret = k3_udma_glue_push_tx_chn_batch(tx_chn->tx_chn,
+ tx_desc_batch->desc_tx_array,
+ tx_desc_batch->desc_dma_array,
+ tx_desc_batch->tx_batch_idx);
+ if (!single_port)
+ spin_unlock_bh(&tx_chn->lock);
+ if (ret)
+ dev_err(common->dev, "failed to push %u pkts on queue %d\n",
+ tx_desc_batch->tx_batch_idx, i);
+ tx_desc_batch->tx_batch_idx = 0;
+ }
+ }
+ atomic_set(&common->tx_batch_count, 0);
+}
+
static netdev_tx_t am65_cpsw_nuss_ndo_slave_xmit(struct sk_buff *skb,
struct net_device *ndev)
{
struct am65_cpsw_common *common = am65_ndev_to_common(ndev);
struct cppi5_host_desc_t *first_desc, *next_desc, *cur_desc;
struct am65_cpsw_port *port = am65_ndev_to_port(ndev);
+ struct am65_cpsw_tx_desc_batch *tx_desc_batch;
struct am65_cpsw_tx_swdata *swdata;
struct device *dev = common->dev;
struct am65_cpsw_tx_chn *tx_chn;
struct netdev_queue *netif_txq;
dma_addr_t desc_dma, buf_dma;
- int ret, q_idx, i;
+ int q_idx, i;
u32 *psdata;
u32 pkt_len;

@@ -1883,20 +1932,31 @@ static netdev_tx_t am65_cpsw_nuss_ndo_slave_xmit(struct sk_buff *skb,

cppi5_hdesc_set_pktlen(first_desc, pkt_len);
desc_dma = k3_cppi_desc_pool_virt2dma(tx_chn->desc_pool, first_desc);
- if (AM65_CPSW_IS_CPSW2G(common)) {
- ret = k3_udma_glue_push_tx_chn(tx_chn->tx_chn, first_desc, desc_dma);
- } else {
- spin_lock_bh(&tx_chn->lock);
- ret = k3_udma_glue_push_tx_chn(tx_chn->tx_chn, first_desc, desc_dma);
- spin_unlock_bh(&tx_chn->lock);
- }
- if (ret) {
- dev_err(dev, "can't push desc %d\n", ret);
- /* inform bql */
- netdev_tx_completed_queue(netif_txq, 1, pkt_len);
- ndev->stats.tx_errors++;
- goto err_free_descs;
- }
+
+ /* Batch processing begins */
+ spin_lock_bh(&common->tx_batch_lock);
+
+ tx_desc_batch = &common->tx_desc_batch[q_idx];
+ tx_desc_batch->desc_tx_array[tx_desc_batch->tx_batch_idx] = first_desc;
+ tx_desc_batch->desc_dma_array[tx_desc_batch->tx_batch_idx] = desc_dma;
+ tx_desc_batch->tx_batch_idx++;
+
+ /* Push the batch across all queues and all netdevs in any of the
+ * following scenarios:
+ * 1. If we reach the batch size
+ * 2. If queue is stopped
+ * 3. No more packets are expected for ndev
+ * 4. We do not have sufficient free descriptors for upcoming packets
+ * and need to push the batch to reclaim them via completion
+ */
+ if ((atomic_inc_return(&common->tx_batch_count) == AM65_CPSW_TX_BATCH_SIZE) ||
+ netif_xmit_stopped(netif_txq) ||
+ !netdev_xmit_more() ||
+ (am65_cpsw_nuss_num_free_tx_desc(tx_chn) < MAX_SKB_FRAGS))
+ am65_cpsw_nuss_submit_ndev_batch(common);
+
+ /* Batch processing ends */
+ spin_unlock_bh(&common->tx_batch_lock);

if (am65_cpsw_nuss_num_free_tx_desc(tx_chn) < MAX_SKB_FRAGS) {
netif_tx_stop_queue(netif_txq);
@@ -2121,19 +2181,88 @@ static int am65_cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
struct xdp_frame **frames, u32 flags)
{
struct am65_cpsw_common *common = am65_ndev_to_common(ndev);
+ struct am65_cpsw_port *port = am65_ndev_to_port(ndev);
+ struct am65_cpsw_tx_desc_batch *tx_desc_batch;
+ struct cppi5_host_desc_t *host_desc;
+ struct am65_cpsw_tx_swdata *swdata;
struct am65_cpsw_tx_chn *tx_chn;
struct netdev_queue *netif_txq;
+ dma_addr_t dma_desc, dma_buf;
int cpu = smp_processor_id();
- int i, nxmit = 0;
+ int i, q_idx, nxmit = 0;
+ struct xdp_frame *xdpf;
+ u32 pkt_len;

- tx_chn = &common->tx_chns[cpu % common->tx_ch_num];
+ q_idx = cpu % common->tx_ch_num;
+ tx_chn = &common->tx_chns[q_idx];
netif_txq = netdev_get_tx_queue(ndev, tx_chn->id);

__netif_tx_lock(netif_txq, cpu);
for (i = 0; i < n; i++) {
- if (am65_cpsw_xdp_tx_frame(ndev, tx_chn, frames[i],
- AM65_CPSW_TX_BUF_TYPE_XDP_NDO))
+ host_desc = am65_cpsw_nuss_get_tx_desc(tx_chn);
+ if (unlikely(!host_desc)) {
+ ndev->stats.tx_dropped++;
+ break;
+ }
+
+ xdpf = frames[i];
+ pkt_len = xdpf->len;
+
+ am65_cpsw_nuss_set_buf_type(tx_chn, host_desc, AM65_CPSW_TX_BUF_TYPE_XDP_NDO);
+
+ dma_buf = dma_map_single(tx_chn->dma_dev, xdpf->data,
+ pkt_len, DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(tx_chn->dma_dev, dma_buf))) {
+ ndev->stats.tx_dropped++;
+ am65_cpsw_nuss_put_tx_desc(tx_chn, host_desc);
break;
+ }
+
+ cppi5_hdesc_init(host_desc, CPPI5_INFO0_HDESC_EPIB_PRESENT,
+ AM65_CPSW_NAV_PS_DATA_SIZE);
+ cppi5_hdesc_set_pkttype(host_desc, AM65_CPSW_CPPI_TX_PKT_TYPE);
+ cppi5_hdesc_set_pktlen(host_desc, pkt_len);
+ cppi5_desc_set_pktids(&host_desc->hdr, 0, AM65_CPSW_CPPI_TX_FLOW_ID);
+ cppi5_desc_set_tags_ids(&host_desc->hdr, 0, port->port_id);
+
+ k3_udma_glue_tx_dma_to_cppi5_addr(tx_chn->tx_chn, &dma_buf);
+ cppi5_hdesc_attach_buf(host_desc, dma_buf, pkt_len, dma_buf, pkt_len);
+
+ swdata = cppi5_hdesc_get_swdata(host_desc);
+ swdata->ndev = ndev;
+ swdata->xdpf = xdpf;
+
+ /* Report BQL before sending the packet */
+ netif_txq = netdev_get_tx_queue(ndev, tx_chn->id);
+ netdev_tx_sent_queue(netif_txq, pkt_len);
+
+ dma_desc = k3_cppi_desc_pool_virt2dma(tx_chn->desc_pool, host_desc);
+
+ /* Batch processing begins */
+ spin_lock_bh(&common->tx_batch_lock);
+
+ tx_desc_batch = &common->tx_desc_batch[q_idx];
+ tx_desc_batch->desc_tx_array[tx_desc_batch->tx_batch_idx] = host_desc;
+ tx_desc_batch->desc_dma_array[tx_desc_batch->tx_batch_idx] = dma_desc;
+ tx_desc_batch->tx_batch_idx++;
+
+ /* Push the batch across all queues and all netdevs in any of the
+ * following scenarios:
+ * 1. If we reach the batch size
+ * 2. If queue is stopped
+ * 3. We are at the last XDP frame of this transmit request
+ * 4. We do not have sufficient free descriptors for upcoming packets
+ * and need to push the batch to reclaim them via completion
+ */
+ if ((atomic_inc_return(&common->tx_batch_count) == AM65_CPSW_TX_BATCH_SIZE) ||
+ netif_xmit_stopped(netif_txq) ||
+ (i == (n - 1)) ||
+ (am65_cpsw_nuss_num_free_tx_desc(tx_chn) < MAX_SKB_FRAGS))
+ am65_cpsw_nuss_submit_ndev_batch(common);
+
+ /* Batch processing ends */
+ spin_unlock_bh(&common->tx_batch_lock);
+
nxmit++;
}
__netif_tx_unlock(netif_txq);
@@ -2497,6 +2626,8 @@ static int am65_cpsw_nuss_init_tx_chns(struct am65_cpsw_common *common)
dev_name(dev), tx_chn->id);
}

+ atomic_set(&common->tx_batch_count, 0);
+
ret = am65_cpsw_nuss_ndev_add_tx_napi(common);
if (ret) {
dev_err(dev, "Failed to add tx NAPI %d\n", ret);
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h
index e64b4cfd6f2c..81405e3bed79 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h
@@ -28,6 +28,8 @@ struct am65_cpts;
#define AM65_CPSW_MAX_TX_DESC 500
#define AM65_CPSW_MAX_RX_DESC 500

+#define AM65_CPSW_TX_BATCH_SIZE 128
+
#define AM65_CPSW_PORT_VLAN_REG_OFFSET 0x014

struct am65_cpsw_slave_data {
@@ -93,6 +95,7 @@ struct am65_cpsw_tx_chn {
struct k3_cppi_desc_pool *desc_pool;
struct k3_udma_glue_tx_channel *tx_chn;
spinlock_t lock; /* protect TX rings in multi-port mode */
+ dma_addr_t cmpl_desc_dma_array[AM65_CPSW_TX_BATCH_SIZE];
struct am65_cpsw_tx_ring tx_ring;
struct hrtimer tx_hrtimer;
unsigned long tx_pace_timeout;
@@ -165,6 +168,12 @@ struct am65_cpsw_devlink {
struct am65_cpsw_common *common;
};

+struct am65_cpsw_tx_desc_batch {
+ struct cppi5_host_desc_t *desc_tx_array[AM65_CPSW_TX_BATCH_SIZE];
+ dma_addr_t desc_dma_array[AM65_CPSW_TX_BATCH_SIZE];
+ u8 tx_batch_idx;
+};
+
struct am65_cpsw_common {
struct device *dev;
struct device *mdio_dev;
@@ -188,6 +197,9 @@ struct am65_cpsw_common {
struct am65_cpsw_tx_chn tx_chns[AM65_CPSW_MAX_QUEUES];
struct completion tdown_complete;
atomic_t tdown_cnt;
+ atomic_t tx_batch_count;
+ spinlock_t tx_batch_lock; /* protect TX batch operations */
+ struct am65_cpsw_tx_desc_batch tx_desc_batch[AM65_CPSW_MAX_QUEUES];

int rx_ch_num_flows;
struct am65_cpsw_rx_chn rx_chns;
--
2.51.1