Okay, I'll do a comparison test.
On 31/07/2023 08.00, Wei Fang wrote:
The XDP_TX feature was not supported before, and all frames
selected for the XDP_TX action were actually handled as XDP_DROP.
So this patch adds XDP_TX support to the FEC driver.
I tested the performance of the XDP_TX feature in both XDP_DRV and
XDP_SKB modes on the i.MX8MM-EVK and i.MX8MP-EVK platforms, and
the test steps and results are as follows.
Step 1: Board A connects to the FEC port of the DUT and runs the
pktgen_sample03_burst_single_flow.sh script to generate and send
burst traffic to the DUT. Note that the packet length was set to
64 bytes and the packet protocol was UDP in my test scenario.
Step 2: The DUT runs the xdp2 program to transmit received UDP
packets back out on the same port where they were received.
The test result runs below should have some more explanation, please.
(more inline code comments below)
root@imx8mmevk:~# ./xdp2 eth0
proto 17: 150326 pkt/s
proto 17: 141920 pkt/s
proto 17: 147338 pkt/s
proto 17: 140783 pkt/s
proto 17: 150400 pkt/s
proto 17: 134651 pkt/s
proto 17: 134676 pkt/s
proto 17: 134959 pkt/s
proto 17: 148152 pkt/s
proto 17: 149885 pkt/s
root@imx8mmevk:~# ./xdp2 -S eth0
proto 17: 131094 pkt/s
proto 17: 134691 pkt/s
proto 17: 138930 pkt/s
proto 17: 129347 pkt/s
proto 17: 133050 pkt/s
proto 17: 132932 pkt/s
proto 17: 136628 pkt/s
proto 17: 132964 pkt/s
proto 17: 131265 pkt/s
proto 17: 135794 pkt/s
root@imx8mpevk:~# ./xdp2 eth0
proto 17: 135817 pkt/s
proto 17: 142776 pkt/s
proto 17: 142237 pkt/s
proto 17: 135673 pkt/s
proto 17: 139508 pkt/s
proto 17: 147340 pkt/s
proto 17: 133329 pkt/s
proto 17: 141171 pkt/s
proto 17: 146917 pkt/s
proto 17: 135488 pkt/s
root@imx8mpevk:~# ./xdp2 -S eth0
proto 17: 133150 pkt/s
proto 17: 133127 pkt/s
proto 17: 133538 pkt/s
proto 17: 133094 pkt/s
proto 17: 133690 pkt/s
proto 17: 133199 pkt/s
proto 17: 133905 pkt/s
proto 17: 132908 pkt/s
proto 17: 133292 pkt/s
proto 17: 133511 pkt/s
For this driver, I would like to see a benchmark comparison between
XDP_TX and XDP_REDIRECT.
As it stands, the below code could create a situation where XDP_REDIRECT is just
as fast as XDP_TX. (Note that I expect XDP_TX to be faster than XDP_REDIRECT.)
Could you explain why you expect XDP_TX to be faster than XDP_REDIRECT?
What's the problem if XDP_TX is as fast as XDP_REDIRECT?
Signed-off-by: Wei Fang <wei.fang@xxxxxxx>
---
V2 changes:
According to Jakub's comments, the V2 patch makes two changes.
1. Call txq_trans_cond_update() in fec_enet_xdp_tx_xmit() to avoid
a tx timeout, as XDP shares the queues with the kernel stack.
2. Tx processing shouldn't call any XDP (or page pool) APIs if the
"budget" is 0.
V3 changes:
1. Remove the second change in V2, because this change has been
separated into another patch and it has been submitted upstream [1].
[1] https://lore.kernel.org/r/20230725074148.2936402-1-wei.fang@nxp.com
---
 drivers/net/ethernet/freescale/fec.h      |  1 +
 drivers/net/ethernet/freescale/fec_main.c | 80 ++++++++++++++++++-----
 2 files changed, 65 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index 8f1edcca96c4..f35445bddc7a 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -547,6 +547,7 @@ enum {
enum fec_txbuf_type {
FEC_TXBUF_T_SKB,
FEC_TXBUF_T_XDP_NDO,
+ FEC_TXBUF_T_XDP_TX,
};
struct fec_tx_buffer {
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 14d0dc7ba3c9..2068fe95504e 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -75,6 +75,8 @@
static void set_multicast_list(struct net_device *ndev);
static void fec_enet_itr_coal_set(struct net_device *ndev);
+static int fec_enet_xdp_tx_xmit(struct net_device *ndev,
+ struct xdp_buff *xdp);
#define DRIVER_NAME "fec"
@@ -960,7 +962,8 @@ static void fec_enet_bd_init(struct net_device *dev)
txq->tx_buf[i].skb = NULL;
}
} else {
- if (bdp->cbd_bufaddr)
+ if (bdp->cbd_bufaddr &&
+ txq->tx_buf[i].type == FEC_TXBUF_T_XDP_NDO)
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
fec16_to_cpu(bdp->cbd_datlen),
@@ -1423,7 +1426,8 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id, int budget)
break;
xdpf = txq->tx_buf[index].xdp;
- if (bdp->cbd_bufaddr)
+ if (bdp->cbd_bufaddr &&
+ txq->tx_buf[index].type == FEC_TXBUF_T_XDP_NDO)
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
fec16_to_cpu(bdp->cbd_datlen),
@@ -1482,7 +1486,7 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id, int budget)
/* Free the sk buffer associated with this last transmit */
dev_kfree_skb_any(skb);
} else {
- xdp_return_frame(xdpf);
+ xdp_return_frame_rx_napi(xdpf);
txq->tx_buf[index].xdp = NULL;
/* restore default tx buffer type: FEC_TXBUF_T_SKB */
@@ -1573,11 +1577,18 @@ fec_enet_run_xdp(struct fec_enet_private *fep, struct bpf_prog *prog,
}
break;
- default:
- bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
- fallthrough;
-
case XDP_TX:
+ err = fec_enet_xdp_tx_xmit(fep->netdev, xdp);
You should pass along the "sync" length value to fec_enet_xdp_tx_xmit().
Because we know the DMA memory comes from the same device (it is already
DMA mapped to it), we can do a DMA sync "to_device" with only the sync length.
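For instance, something like this (sketch only; it reuses the "sync" length
that this function already computes for page_pool_put_page(), and assumes
fec_enet_xdp_tx_xmit() grows an extra parameter for it):

	case XDP_TX:
		/* sync = bytes the XDP prog may have touched; same value
		 * that is passed to page_pool_put_page() below
		 */
		err = fec_enet_xdp_tx_xmit(fep->netdev, xdp, sync);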
+ if (err) {
Add an unlikely(err), or do it like the XDP_REDIRECT case above, which takes
the likely case "if (!err)" first.
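I.e. something along these lines (sketch only; same logic as the patch, just
with the error path annotated as unlikely):

	err = fec_enet_xdp_tx_xmit(fep->netdev, xdp);
	if (unlikely(err)) {
		ret = FEC_ENET_XDP_CONSUMED;
		page = virt_to_head_page(xdp->data);
		page_pool_put_page(rxq->page_pool, page, sync, true);
	} else {
		ret = FEC_ENET_XDP_TX;
	}
	break;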
+ ret = FEC_ENET_XDP_CONSUMED;
+ page = virt_to_head_page(xdp->data);
+ page_pool_put_page(rxq->page_pool, page, sync, true);
+ } else {
+ ret = FEC_ENET_XDP_TX;
+ }
+ break;
+
+ default:
bpf_warn_invalid_xdp_action(fep->netdev, prog, act);
fallthrough;
@@ -3793,7 +3804,8 @@ fec_enet_xdp_get_tx_queue(struct fec_enet_private *fep, int index)
static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
struct fec_enet_priv_tx_q *txq,
- struct xdp_frame *frame)
+ struct xdp_frame *frame,
+ bool ndo_xmit)
E.g. add a dma_sync_len parameter.
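I.e. roughly (sketch; the dma_sync_len name is only a suggestion):

	static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
					   struct fec_enet_priv_tx_q *txq,
					   struct xdp_frame *frame,
					   u32 dma_sync_len, bool ndo_xmit)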
{
unsigned int index, status, estatus;
struct bufdesc *bdp;
@@ -3813,10 +3825,24 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
index = fec_enet_get_bd_index(bdp, &txq->bd);
- dma_addr = dma_map_single(&fep->pdev->dev, frame->data,
- frame->len, DMA_TO_DEVICE);
- if (dma_mapping_error(&fep->pdev->dev, dma_addr))
- return -ENOMEM;
+ if (ndo_xmit) {
+ dma_addr = dma_map_single(&fep->pdev->dev, frame->data,
+ frame->len, DMA_TO_DEVICE);
+ if (dma_mapping_error(&fep->pdev->dev, dma_addr))
+ return -ENOMEM;
+
+ txq->tx_buf[index].type = FEC_TXBUF_T_XDP_NDO;
+ } else {
+ struct page *page = virt_to_page(frame->data);
+
+ dma_addr = page_pool_get_dma_addr(page) + sizeof(*frame) +
+ frame->headroom;
+ dma_sync_single_for_device(&fep->pdev->dev, dma_addr,
+ frame->len, DMA_BIDIRECTIONAL);
Optimization: use dma_sync_len here instead of frame->len.
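I.e. (sketch, assuming the dma_sync_len parameter suggested above):

	dma_sync_single_for_device(&fep->pdev->dev, dma_addr,
				   dma_sync_len, DMA_BIDIRECTIONAL);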
+ txq->tx_buf[index].type = FEC_TXBUF_T_XDP_TX;
+ }
+
+ txq->tx_buf[index].xdp = frame;
status |= (BD_ENET_TX_INTR | BD_ENET_TX_LAST);
if (fep->bufdesc_ex)
@@ -3835,9 +3861,6 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
ebdp->cbd_esc = cpu_to_fec32(estatus);
}
- txq->tx_buf[index].type = FEC_TXBUF_T_XDP_NDO;
- txq->tx_buf[index].xdp = frame;
-
/* Make sure the updates to rest of the descriptor are performed before
* transferring ownership.
*/
@@ -3863,6 +3886,31 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
return 0;
}
+static int fec_enet_xdp_tx_xmit(struct net_device *ndev,
+ struct xdp_buff *xdp)
+{
E.g. add a dma_sync_len parameter.
+ struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);
XDP_TX can avoid this conversion to xdp_frame.
It would require some refactoring of fec_enet_txq_xmit_frame().
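One possible shape for that refactor (purely hypothetical, helper name
invented for illustration): split out a low-level xmit helper that only needs
the DMA address, length and buffer type, so the XDP_TX path can fill the
descriptor straight from the xdp_buff while ndo_xdp_xmit() keeps using the
xdp_frame:

	/* hypothetical helper shared by the ndo_xdp_xmit and XDP_TX paths */
	static int fec_enet_txq_xmit_buf(struct fec_enet_private *fep,
					 struct fec_enet_priv_tx_q *txq,
					 dma_addr_t dma_addr, u16 len,
					 enum fec_txbuf_type type);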
+ struct fec_enet_private *fep = netdev_priv(ndev);
+ struct fec_enet_priv_tx_q *txq;
+ int cpu = smp_processor_id();
+ struct netdev_queue *nq;
+ int queue, ret;
+
+ queue = fec_enet_xdp_get_tx_queue(fep, cpu);
+ txq = fep->tx_queue[queue];
+ nq = netdev_get_tx_queue(fep->netdev, queue);
+
+ __netif_tx_lock(nq, cpu);
It is sad that XDP_TX takes a lock for each frame.
+
+ /* Avoid tx timeout as XDP shares the queue with kernel stack */
+ txq_trans_cond_update(nq);
+ ret = fec_enet_txq_xmit_frame(fep, txq, xdpf, false);
Add/pass parameter dma_sync_len to fec_enet_txq_xmit_frame().
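I.e. (sketch, assuming fec_enet_xdp_tx_xmit() itself gained a dma_sync_len
argument as suggested earlier):

	ret = fec_enet_txq_xmit_frame(fep, txq, xdpf, dma_sync_len, false);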
+
+ __netif_tx_unlock(nq);
+
+ return ret;
+}
+
static int fec_enet_xdp_xmit(struct net_device *dev,
int num_frames,
struct xdp_frame **frames,
@@ -3885,7 +3933,7 @@ static int fec_enet_xdp_xmit(struct net_device *dev,
/* Avoid tx timeout as XDP shares the queue with kernel stack */
txq_trans_cond_update(nq);
for (i = 0; i < num_frames; i++) {
- if (fec_enet_txq_xmit_frame(fep, txq, frames[i]) < 0)
+ if (fec_enet_txq_xmit_frame(fep, txq, frames[i], true) < 0)
break;
sent_frames++;
}