Re: [RFC net-next v3 2/2] net/mlx5e: Add per queue netdev-genl stats

From: Tariq Toukan
Date: Sun Jun 02 2024 - 05:14:43 EST




On 29/05/2024 6:16, Joe Damato wrote:
Add functions to support the netdev-genl per queue stats API.

./cli.py --spec netlink/specs/netdev.yaml \
--dump qstats-get --json '{"scope": "queue"}'

...snip

{'ifindex': 7,
'queue-id': 62,
'queue-type': 'rx',
'rx-alloc-fail': 0,
'rx-bytes': 105965251,
'rx-packets': 179790},
{'ifindex': 7,
'queue-id': 0,
'queue-type': 'tx',
'tx-bytes': 9402665,
'tx-packets': 17551},

...snip

Also tested with the script tools/testing/selftests/drivers/net/stats.py
in several scenarios to ensure stats tallying was correct:

- on boot (default queue counts)
- adjusting queue count up or down (ethtool -L eth0 combined ...)
- adding mqprio TCs

Please also test with the interface down.


Signed-off-by: Joe Damato <jdamato@xxxxxxxxxx>
---
.../net/ethernet/mellanox/mlx5/core/en_main.c | 132 ++++++++++++++++++
1 file changed, 132 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ce15805ad55a..515c16a88a6c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -39,6 +39,7 @@
#include <linux/debugfs.h>
#include <linux/if_bridge.h>
#include <linux/filter.h>
+#include <net/netdev_queues.h>
#include <net/page_pool/types.h>
#include <net/pkt_sched.h>
#include <net/xdp_sock_drv.h>
@@ -5293,6 +5294,136 @@ static bool mlx5e_tunnel_any_tx_proto_supported(struct mlx5_core_dev *mdev)
return (mlx5_vxlan_allowed(mdev->vxlan) || mlx5_geneve_tx_allowed(mdev));
}
+static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
+ struct netdev_queue_stats_rx *stats)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_channel_stats *channel_stats;
+ struct mlx5e_rq_stats *xskrq_stats;
+ struct mlx5e_rq_stats *rq_stats;
+
+ if (mlx5e_is_uplink_rep(priv))
+ return;
+
+ channel_stats = priv->channel_stats[i];
+ xskrq_stats = &channel_stats->xskrq;
+ rq_stats = &channel_stats->rq;
+
+ stats->packets = rq_stats->packets + xskrq_stats->packets;
+ stats->bytes = rq_stats->bytes + xskrq_stats->bytes;
+ stats->alloc_fail = rq_stats->buff_alloc_err +
+ xskrq_stats->buff_alloc_err;
+}
+
+static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
+ struct netdev_queue_stats_tx *stats)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ struct mlx5e_channel_stats *channel_stats;
+ struct mlx5e_sq_stats *sq_stats;
+ int ch_ix, tc_ix;
+
+ mutex_lock(&priv->state_lock);
+ txq_ix_to_chtc_ix(&priv->channels.params, i, &ch_ix, &tc_ix);
+ mutex_unlock(&priv->state_lock);
+
+ channel_stats = priv->channel_stats[ch_ix];
+ sq_stats = &channel_stats->sq[tc_ix];
+
+ stats->packets = sq_stats->packets;
+ stats->bytes = sq_stats->bytes;
+}
+
+static void mlx5e_get_base_stats(struct net_device *dev,
+ struct netdev_queue_stats_rx *rx,
+ struct netdev_queue_stats_tx *tx)
+{
+ struct mlx5e_priv *priv = netdev_priv(dev);
+ int i, j;
+
+ if (!mlx5e_is_uplink_rep(priv)) {
+ rx->packets = 0;
+ rx->bytes = 0;
+ rx->alloc_fail = 0;
+
+ /* compute stats for deactivated RX queues
+ *
+ * if priv->channels.num == 0 the device is down, so compute
+ * stats for every queue.
+ *
+ * otherwise, compute only the queues which have been deactivated.
+ */
+ mutex_lock(&priv->state_lock);
+ if (priv->channels.num == 0)
+ i = 0;

This is not consistent with the above implementation of mlx5e_get_queue_stats_rx(), which always returns the stats even if the channel is down.
This way, you'll double-count the channels that are down: once in the per-queue stats and once more in the base stats.

I think you should always start from priv->channels.params.num_channels.

+ else
+ i = priv->channels.params.num_channels;
+ mutex_unlock(&priv->state_lock);

I understand that you're following the guidelines by taking the lock here; I just don't think it improves anything. If channels can be modified between the calls to mlx5e_get_base_stats() and mlx5e_get_queue_stats_rx(), then wrapping each priv->channels access with a lock protects each individual dereference, but it does not necessarily give a consistent "snapshot" of the stats.

The rtnl_lock should take care of that, as the driver holds it when changing the number of channels and updating real_num_rx_queues/real_num_tx_queues.

That said, I would cautiously say that you can drop the mutex once you have made the changes requested above.

+
+ for (; i < priv->stats_nch; i++) {
+ struct netdev_queue_stats_rx rx_i = {0};
+
+ mlx5e_get_queue_stats_rx(dev, i, &rx_i);
+
+ rx->packets += rx_i.packets;
+ rx->bytes += rx_i.bytes;
+ rx->alloc_fail += rx_i.alloc_fail;
+ }
+
+ if (priv->rx_ptp_opened) {
+ struct mlx5e_rq_stats *rq_stats = &priv->ptp_stats.rq;
+
+ rx->packets += rq_stats->packets;
+ rx->bytes += rq_stats->bytes;
+ }
+ }
+
+ tx->packets = 0;
+ tx->bytes = 0;
+
+ mutex_lock(&priv->state_lock);
+ for (i = 0; i < priv->stats_nch; i++) {
+ struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
+
+ /* while iterating through all channels [0, stats_nch], there
+ * are two cases to handle:
+ *
+ * 1. the channel is available, so sum only the unavailable TCs
+ * [mlx5e_get_dcb_num_tc, max_opened_tc).
+ *
+ * 2. the channel is unavailable, so sum all TCs [0, max_opened_tc).
+ */

Why not name the local variable 'tc' instead of 'j'?

+ if (i < priv->channels.params.num_channels) {
+ j = mlx5e_get_dcb_num_tc(&priv->channels.params);
+ } else {
+ j = 0;
+ }

Remove the braces (each branch is a single statement), or use the ternary operator.

+
+ for (; j < priv->max_opened_tc; j++) {
+ struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
+
+ tx->packets += sq_stats->packets;
+ tx->bytes += sq_stats->bytes;
+ }
+ }
+ mutex_unlock(&priv->state_lock);
+

Same comment regarding dropping the mutex.

+ if (priv->tx_ptp_opened) {
+ for (j = 0; j < priv->max_opened_tc; j++) {
+ struct mlx5e_sq_stats *sq_stats = &priv->ptp_stats.sq[j];
+
+ tx->packets += sq_stats->packets;
+ tx->bytes += sq_stats->bytes;
+ }
+ }
+}
+
+static const struct netdev_stat_ops mlx5e_stat_ops = {
+ .get_queue_stats_rx = mlx5e_get_queue_stats_rx,
+ .get_queue_stats_tx = mlx5e_get_queue_stats_tx,
+ .get_base_stats = mlx5e_get_base_stats,
+};
+
static void mlx5e_build_nic_netdev(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -5310,6 +5441,7 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
netdev->watchdog_timeo = 15 * HZ;
+ netdev->stat_ops = &mlx5e_stat_ops;
netdev->ethtool_ops = &mlx5e_ethtool_ops;
netdev->vlan_features |= NETIF_F_SG;