Re: [PATCH net-next v5 3/5] veth: implement Byte Queue Limits (BQL) for latency reduction

From: Simon Schippers

Date: Thu May 07 2026 - 02:56:46 EST

On 5/5/26 15:21, hawk@xxxxxxxxxx wrote:
> From: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
>
> Commit dc82a33297fc ("veth: apply qdisc backpressure on full ptr_ring to
> reduce TX drops") gave qdiscs control over veth by returning
> NETDEV_TX_BUSY when the ptr_ring is full (DRV_XOFF). That commit noted
> a known limitation: the 256-entry ptr_ring sits in front of the qdisc as
> a dark buffer, adding base latency because the qdisc has no visibility
> into how many bytes are already queued there.
>
> Add BQL support so the qdisc gets feedback and can begin shaping traffic
> before the ring fills. In testing with fq_codel, BQL reduces ping RTT
> under UDP load from ~6.61ms to ~0.36ms (18x).
>
> Charge a fixed VETH_BQL_UNIT (1) per packet rather than skb->len, so
> the DQL limit tracks packets-in-flight. Unlike a physical NIC, veth
> has no link speed -- the ptr_ring drains at CPU speed and is
> packet-indexed, not byte-indexed, so bytes are not the natural unit.
> With byte-based charging, small packets sneak many more entries into
> the ring before STACK_XOFF fires, deepening the dark buffer under
> mixed-size workloads. Testing with a concurrent min-size packet flood
> shows 3.7x ping RTT degradation with skb->len charging versus no
> change with fixed-unit charging.
>
> Charge BQL inside veth_xdp_rx() under the ptr_ring producer_lock, after
> confirming the ring is not full. The charge must precede the produce
> because the NAPI consumer can run on another CPU and complete the SKB
> the instant it becomes visible in the ring. Doing both under the same
> lock avoids a pre-charge/undo pattern -- BQL is only charged when
> produce is guaranteed to succeed.
>
> BQL is enabled only when a real qdisc is attached (guarded by
> !qdisc_txq_has_no_queue), as HARD_TX_LOCK provides serialization
> for TXQ modification like dql_queued(). For lltx devices, like veth,
> this HARD_TX_LOCK serialization isn't provided. The ptr_ring
> producer_lock provides additional serialization that would allow
> BQL to work correctly even with noqueue, though that combination
> is not currently enabled, as the netstack will drop and warn.
>
> Track per-SKB BQL state via a VETH_BQL_FLAG pointer tag in the ptr_ring
> entry. This is necessary because the qdisc can be replaced live while
> SKBs are in-flight -- each SKB must carry the charge decision made at
> enqueue time rather than re-checking the peer's qdisc at completion.
>
> Complete per-SKB in veth_xdp_rcv() rather than in bulk, so STACK_XOFF
> clears promptly when producer and consumer run on different CPUs.
>
> BQL introduces a second independent queue-stop mechanism (STACK_XOFF)
> alongside the existing DRV_XOFF (ring full). Both must be clear for
> the queue to transmit. Reset BQL state in veth_napi_del_range() after
> synchronize_net() to avoid racing with in-flight veth_poll() calls.
> Clamp the reset loop to the peer's real_num_tx_queues, since the peer
> may have fewer TX queues than the local device has RX queues (e.g. when
> veth is enslaved to a bond with XDP attached).
>
> Signed-off-by: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
> ---
> drivers/net/veth.c | 86 ++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 75 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 0cfb19b760dd..86b78900c48e 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -34,9 +34,13 @@
> #define DRV_VERSION "1.0"
>
> #define VETH_XDP_FLAG BIT(0)
> +#define VETH_BQL_FLAG BIT(1)
> #define VETH_RING_SIZE 256
> #define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN)
>
> +/* Fixed BQL charge: DQL limit tracks packets-in-flight, not bytes */
> +#define VETH_BQL_UNIT 1
> +
> #define VETH_XDP_TX_BULK_SIZE 16
> #define VETH_XDP_BATCH 16
>
> @@ -280,6 +284,21 @@ static bool veth_is_xdp_frame(void *ptr)
> return (unsigned long)ptr & VETH_XDP_FLAG;
> }
>
> +static bool veth_ptr_is_bql(void *ptr)
> +{
> + return (unsigned long)ptr & VETH_BQL_FLAG;
> +}
> +
> +static struct sk_buff *veth_ptr_to_skb(void *ptr)
> +{
> + return (void *)((unsigned long)ptr & ~VETH_BQL_FLAG);
> +}
> +
> +static void *veth_skb_to_ptr(struct sk_buff *skb, bool bql)
> +{
> + return bql ? (void *)((unsigned long)skb | VETH_BQL_FLAG) : skb;
> +}
> +
> static struct xdp_frame *veth_ptr_to_xdp(void *ptr)
> {
> return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
> @@ -295,7 +314,7 @@ static void veth_ptr_free(void *ptr)
> if (veth_is_xdp_frame(ptr))
> xdp_return_frame(veth_ptr_to_xdp(ptr));
> else
> - kfree_skb(ptr);
> + kfree_skb(veth_ptr_to_skb(ptr));
> }
>
> static void __veth_xdp_flush(struct veth_rq *rq)
> @@ -309,19 +328,33 @@ static void __veth_xdp_flush(struct veth_rq *rq)
> }
> }
>
> -static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
> +static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb, bool do_bql,
> + struct netdev_queue *txq)
> {
> - if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb)))
> + struct ptr_ring *ring = &rq->xdp_ring;
> +
> + spin_lock(&ring->producer_lock);
> + if (unlikely(!ring->size) || __ptr_ring_full(ring)) {
> + spin_unlock(&ring->producer_lock);
> return NETDEV_TX_BUSY; /* signal qdisc layer */
> + }
> +
> + /* BQL charge before produce; consumer cannot see entry yet */
> + if (do_bql)
> + netdev_tx_sent_queue(txq, VETH_BQL_UNIT);
> +
> + __ptr_ring_produce(ring, veth_skb_to_ptr(skb, do_bql));
> + spin_unlock(&ring->producer_lock);
>
> return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */
> }
>
> static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
> - struct veth_rq *rq, bool xdp)
> + struct veth_rq *rq, bool xdp, bool do_bql,
> + struct netdev_queue *txq)
> {
> return __dev_forward_skb(dev, skb) ?: xdp ?
> - veth_xdp_rx(rq, skb) :
> + veth_xdp_rx(rq, skb, do_bql, txq) :
> __netif_rx(skb);
> }
>
> @@ -348,10 +381,11 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
> {
> struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> struct veth_rq *rq = NULL;
> - struct netdev_queue *txq;
> + struct netdev_queue *txq = NULL;
> struct net_device *rcv;
> int length = skb->len;
> bool use_napi = false;
> + bool do_bql = false;
> int ret, rxq;
>
> rcu_read_lock();
> @@ -375,8 +409,12 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
> }
>
> skb_tx_timestamp(skb);
> -
> - ret = veth_forward_skb(rcv, skb, rq, use_napi);
> + if (rxq < dev->real_num_tx_queues) {
> + txq = netdev_get_tx_queue(dev, rxq);
> + /* BQL charge happens inside veth_xdp_rx() under producer_lock */
> + do_bql = use_napi && !qdisc_txq_has_no_queue(txq);
> + }
> + ret = veth_forward_skb(rcv, skb, rq, use_napi, do_bql, txq);
> switch (ret) {
> case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */
> if (!use_napi)
> @@ -412,6 +450,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
> net_crit_ratelimited("%s(%s): Invalid return code(%d)",
> __func__, dev->name, ret);
> }
> +
> rcu_read_unlock();
>
> return ret;
> @@ -900,7 +939,8 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>
> static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> struct veth_xdp_tx_bq *bq,
> - struct veth_stats *stats)
> + struct veth_stats *stats,
> + struct netdev_queue *peer_txq)
> {
> int i, done = 0, n_xdpf = 0;
> void *xdpf[VETH_XDP_BATCH];
> @@ -928,9 +968,13 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> }
> } else {
> /* ndo_start_xmit */
> - struct sk_buff *skb = ptr;
> + bool bql_charged = veth_ptr_is_bql(ptr);
> + struct sk_buff *skb = veth_ptr_to_skb(ptr);
>
> stats->xdp_bytes += skb->len;
> + if (peer_txq && bql_charged)
> + netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);

In the discussion with Jonas [1], I left a comment explaining why I think
this doesn’t work.

I still think first that adding an option to modify the hard-coded
VETH_RING_SIZE is the way to go.

Thanks!

[1] Link: https://lore.kernel.org/netdev/e8cdba04-aa9a-45c6-9807-8274b62920df@xxxxxxxxxxxxxx/

> +
> skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
> if (skb) {
> if (skb_shared(skb) || skb_unclone(skb, GFP_ATOMIC))
> @@ -976,7 +1020,7 @@ static int veth_poll(struct napi_struct *napi, int budget)
> netdev_get_tx_queue(peer_dev, queue_idx) : NULL;
>
> xdp_set_return_frame_no_direct();
> - done = veth_xdp_rcv(rq, budget, &bq, &stats);
> + done = veth_xdp_rcv(rq, budget, &bq, &stats, peer_txq);
>
> if (stats.xdp_redirect > 0)
> xdp_do_flush();
> @@ -1074,6 +1118,7 @@ static int __veth_napi_enable(struct net_device *dev)
> static void veth_napi_del_range(struct net_device *dev, int start, int end)
> {
> struct veth_priv *priv = netdev_priv(dev);
> + struct net_device *peer;
> int i;
>
> for (i = start; i < end; i++) {
> @@ -1092,6 +1137,24 @@ static void veth_napi_del_range(struct net_device *dev, int start, int end)
> ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
> }
>
> + /* Reset BQL and wake stopped peer txqs. A concurrent veth_xmit()
> + * may have set DRV_XOFF between rcu_assign_pointer(napi, NULL) and
> + * synchronize_net(), and NAPI can no longer clear it.
> + * Only wake when the device is still up.
> + */
> + peer = rtnl_dereference(priv->peer);
> + if (peer) {
> + int peer_end = min_t(int, end, peer->real_num_tx_queues);
> +
> + for (i = start; i < peer_end; i++) {
> + struct netdev_queue *txq = netdev_get_tx_queue(peer, i);
> +
> + netdev_tx_reset_queue(txq);
> + if (netif_running(dev))
> + netif_tx_wake_queue(txq);
> + }
> + }
> +
> for (i = start; i < end; i++) {
> page_pool_destroy(priv->rq[i].page_pool);
> priv->rq[i].page_pool = NULL;
> @@ -1741,6 +1804,7 @@ static void veth_setup(struct net_device *dev)
> dev->priv_flags |= IFF_PHONY_HEADROOM;
> dev->priv_flags |= IFF_DISABLE_NETPOLL;
> dev->lltx = true;
> + dev->bql = true;
>
> dev->netdev_ops = &veth_netdev_ops;
> dev->xdp_metadata_ops = &veth_xdp_metadata_ops;