Re: [PATCH net-next v7 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs

From: Simon Schippers

Date: Sat Jun 13 2026 - 10:15:33 EST

On 6/12/26 10:35, hawk@xxxxxxxxxx wrote:
> From: Simon Schippers <simon.schippers@xxxxxxxxxxxxxx>
>
> Per-packet BQL completion forces DQL to converge on limit=2, causing
> excessive NAPI scheduling overhead and qdisc requeues.
>
> Accumulate BQL completions and flush them when a configurable time
> threshold (tx-usecs) is exceeded, letting DQL discover a limit that
> bounds actual queuing delay to the configured interval. Coalescing
> state persists across NAPI polls in struct veth_rq so completions can
> accumulate beyond a single budget=64 cycle.
>
> The flush condition is:
>
> state->time + bql_flush_ns <= current_time || state->n_bql > dql.limit
>
> Flushing when n_bql exceeds dql.limit handles BQL starvation.
>
> The comparison is strictly greater-than because netdev_tx_sent_queue()
> always lets the producer exceed the limit by one before it stops, so
> n_bql == dql.limit is a normal in-flight state. dql.limit lives in
> the same cacheline as the completion path, so the check is cheap.
>
> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
> setting tx-usecs to 0 disables coalescing and falls back to per-packet
> completion.
>
> ethtool -C <veth-dev> tx-usecs 500 # 500us coalescing
> ethtool -C <veth-dev> tx-usecs 0 # per-packet (no coalescing)
>
> Co-developed-by: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
> Signed-off-by: Jesper Dangaard Brouer <hawk@xxxxxxxxxx>
> Co-developed-by: Jonas Köppeler <j.koeppeler@xxxxxxxxxxxx>
> Signed-off-by: Jonas Köppeler <j.koeppeler@xxxxxxxxxxxx>
> Signed-off-by: Simon Schippers <simon.schippers@xxxxxxxxxxxxxx>
> ---
> drivers/net/veth.c | 123 ++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 117 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 2473f730734b..c62d87a8402c 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -28,6 +28,7 @@
> #include <linux/bpf_trace.h>
> #include <linux/net_tstamp.h>
> #include <linux/skbuff_ref.h>
> +#include <linux/sched/clock.h>
> #include <net/page_pool/helpers.h>
>
> #define DRV_NAME "veth"
> @@ -50,6 +51,7 @@
> * delay => 64 * 250 ms = 16 s.
> */
> #define VETH_WATCHDOG_TIMEOUT_MS (64 * 250)
> +#define VETH_BQL_COAL_TX_USECS 100 /* default tx-usecs for BQL batching */
>
> struct veth_stats {
> u64 rx_drops;
> @@ -69,6 +71,11 @@ struct veth_rq_stats {
> struct u64_stats_sync syncp;
> };
>
> +struct veth_bql_state {
> + u64 time; /* sched_clock() when current coalescing window started */
> + uint n_bql; /* BQL completions batched in the current window */
> +};
> +
> struct veth_rq {
> struct napi_struct xdp_napi;
> struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
> @@ -76,6 +83,7 @@ struct veth_rq {
> struct bpf_prog __rcu *xdp_prog;
> struct xdp_mem_info xdp_mem;
> struct veth_rq_stats stats;
> + struct veth_bql_state bql_state;
> bool rx_notify_masked;
> struct ptr_ring xdp_ring;
> struct xdp_rxq_info xdp_rxq;
> @@ -88,6 +96,7 @@ struct veth_priv {
> struct bpf_prog *_xdp_prog;
> struct veth_rq *rq;
> unsigned int requested_headroom;
> + unsigned int tx_coal_usecs; /* BQL completion coalescing */
> };
>
> struct veth_xdp_tx_bq {
> @@ -272,7 +281,56 @@ static void veth_get_channels(struct net_device *dev,
> static int veth_set_channels(struct net_device *dev,
> struct ethtool_channels *ch);
>
> +static int veth_get_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack)
> +{
> + struct veth_priv *priv = netdev_priv(dev);
> +
> + ec->tx_coalesce_usecs = priv->tx_coal_usecs;
> + return 0;
> +}
> +
> +static int veth_set_coalesce(struct net_device *dev,
> + struct ethtool_coalesce *ec,
> + struct kernel_ethtool_coalesce *kernel_coal,
> + struct netlink_ext_ack *extack)
> +{
> + struct veth_priv *priv = netdev_priv(dev);
> + struct net_device *peer;
> +
> + /* The coalescing window delays BQL completions, so keep tx-usecs well
> + * below the tx_timeout watchdog; otherwise a large value could stall a
> + * stopped queue long enough to trip a false watchdog timeout. Cap at
> + * half the watchdog to leave a generous safety margin. tx-usecs is
> + * microseconds, the watchdog is milliseconds.
> + */
> + if (ec->tx_coalesce_usecs > VETH_WATCHDOG_TIMEOUT_MS / 2 * USEC_PER_MSEC) {
> + NL_SET_ERR_MSG_MOD(extack,
> + "tx-usecs must stay below half the tx_timeout watchdog");
> + return -ERANGE;
> + }
> +
> + /* Paired with READ_ONCE in veth_xdp_rcv(). */
> + WRITE_ONCE(priv->tx_coal_usecs, ec->tx_coalesce_usecs);
> +
> + /* veth_xdp_rcv() reads each device's own value, so mirror it onto
> + * the peer to keep the pair symmetric: both directions coalesce
> + * with the same tx-usecs. Called under RTNL, rtnl_dereference() is safe.
> + */
> + peer = rtnl_dereference(priv->peer);
> + if (peer) {
> + struct veth_priv *peer_priv = netdev_priv(peer);
> +
> + WRITE_ONCE(peer_priv->tx_coal_usecs, ec->tx_coalesce_usecs);
> + }
> +
> + return 0;
> +}
> +
> static const struct ethtool_ops veth_ethtool_ops = {
> + .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS,
> .get_drvinfo = veth_get_drvinfo,
> .get_link = ethtool_op_get_link,
> .get_strings = veth_get_strings,
> @@ -282,6 +340,8 @@ static const struct ethtool_ops veth_ethtool_ops = {
> .get_ts_info = ethtool_op_get_ts_info,
> .get_channels = veth_get_channels,
> .set_channels = veth_set_channels,
> + .get_coalesce = veth_get_coalesce,
> + .set_coalesce = veth_set_coalesce,
> };
>
> /* general routines */
> @@ -969,13 +1029,54 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
> return NULL;
> }
>
> +static void veth_bql_maybe_complete(struct veth_bql_state *state,
> + struct netdev_queue *peer_txq,
> + u64 bql_flush_ns)
> +{
> + u64 current_time;
> +
> + /* There is no reason to complete with 0 and
> + * peer_txq could go away.
> + */
> + if (!state->n_bql || !peer_txq)
> + return;
> +
> + current_time = sched_clock();
> +
> + /* We complete if:
> + * 1. We reach bql_flush_ns.
> + * 2. We potentially have BQL starvation.
> + */
> + if (state->time + bql_flush_ns <= current_time ||
> + state->n_bql > peer_txq->dql.limit) {

Both Sashiko-Nipa and Sashiko-Gemini are right, this is missing a
#ifdef CONFIG_BQL. Not sure what is the best way to add them.
And for the struct we could maybe do:

#ifdef CONFIG_BQL
struct veth_bql_state {
u64 time; /* sched_clock() when current coalescing window started */
uint n_bql; /* BQL completions batched in the current window */
};
#else
struct veth_bql_state {};
#endif

> + netdev_tx_completed_queue(peer_txq, state->n_bql,
> + state->n_bql * VETH_BQL_UNIT);
> + state->time = current_time;
> + state->n_bql = 0;
> + }
> +}
> +
> static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> struct veth_xdp_tx_bq *bq,
> struct veth_stats *stats,
> struct netdev_queue *peer_txq)
> {
> + struct veth_priv *priv = netdev_priv(rq->dev);
> + struct veth_bql_state *state = &rq->bql_state;
> int i, done = 0, n_xdpf = 0;
> void *xdpf[VETH_XDP_BATCH];
> + u64 bql_flush_ns;
> +
> + /* Mirrored to both peers; paired with WRITE_ONCE() in veth_set_coalesce */
> + bql_flush_ns = (u64)READ_ONCE(priv->tx_coal_usecs) * 1000;
> +
> + /* Clamp stored timestamp in case we migrated to a CPU with a behind
> + * sched_clock(); tries to reduce late BQL flushes.
> + */
> + state->time = min(state->time, sched_clock());
> +
> + /* Flush completions that timed out since the previous NAPI poll. */
> + veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
>
> for (i = 0; i < budget; i++) {
> void *ptr = __ptr_ring_consume(&rq->xdp_ring);
> @@ -1000,12 +1101,11 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> }
> } else {
> /* ndo_start_xmit */
> - bool bql_charged = veth_ptr_is_bql(ptr);
> struct sk_buff *skb = veth_ptr_to_skb(ptr);
>
> + if (veth_ptr_is_bql(ptr))
> + state->n_bql++;
> stats->xdp_bytes += skb->len;
> - if (peer_txq && bql_charged)
> - netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);
>
> skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
> if (skb) {
> @@ -1015,6 +1115,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
> napi_gro_receive(&rq->xdp_napi, skb);
> }
> }
> + veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
> done++;

Sashiko-Nipa reports:

"If veth_xdp_rcv() finishes and returns a done count less than the budget,
NAPI will go to sleep in veth_poll(). Do we need to unconditionally flush
any stranded BQL completions in veth_poll() before sleeping?
If completions are left in rq->bql_state indefinitely across NAPI idle
periods, it might present an artificially massive delay to DQL. This could
cause DQL to mistakenly conclude the hardware is extremely slow and
aggressively shrink dql.limit to its minimum, crippling throughput on
subsequent bursts."

Again the issue that I found to be non-problematic in [1] and can be
seen by an BQL inflight > 0 when for example pktgen suddenly stops.

If we would "unconditionally flush any stranded BQL completions in
veth_poll() before sleeping" we would *not* accumulate BQL completions
across NAPI polls but we want to do that.

Do you agree?

[1] https://lore.kernel.org/netdev/c8650d3a-e488-4279-b28f-549d766c23a1@xxxxxxxxxxxxxx/