Re: [PATCH net-next] be2net: gather more debug info and display on a tx-timeout

From: Jakub Kicinski
Date: Sat Nov 23 2019 - 19:29:20 EST


On Fri, 22 Nov 2019 16:17:19 +0530, Kalesh A P wrote:
> From: Kalesh AP <kalesh-anakkur.purayil@xxxxxxxxxxxx>
>
> In order to start recording the last few tx wqes and
> tx completions, user has to set the msg level to non-zero
> value using "ethtool -s ethX msglvl 1"
>
> This patch does the following things:
> 1. record last 200 WQE information
> 2. record first 128 bytes of last 200 TX packets
> 3. record last 200 TX completion info
> 4. On TX timeout, log these information for debugging
>
> Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@xxxxxxxxxxxx>
> Reviewed-by: Venkat Duvvuru <venkatkumar.duvvuru@xxxxxxxxxxxx>

Please consider more modern infrastructure for these sort of dumps,
like devlink health API.

> +/* Store latest 200 occurrences */
> +#define BE_TXQ_INFO_LEN 200
> +#define PKT_DUMP_SIZE 128
> +
> +struct be_tx_pktinfo {
> + u16 head;
> + u16 tail;
> + u16 used;
> + struct be_wrb_params wqe_hdr;
> + u8 skb_data[PKT_DUMP_SIZE];
> + u32 len;
> + u32 skb_len;
> + bool valid;

nit: you could save 4B per entry if the 'valid' was after 'used'

> +};
> +
> +struct be_tx_dump_cmpl {
> + u32 info[32];
> + bool valid;
> +};
> +
> /* Structure to hold some data of interest obtained from a TX CQE */
> struct be_tx_compl_info {
> u8 status; /* Completion status */

> +void be_record_tx_cmpl(struct be_tx_obj *txo,
> + struct be_eth_tx_compl *cmpl)
> +{
> + u32 offset = txo->tx_cmpl_idx;
> + struct be_tx_dump_cmpl *cmpl_dump = &txo->cmpl_info[offset];

nit: reverse xmas tree variable is generally prefered in the networking
code, meaning the cmpl_dump should be declared first, and inited in the
code rather than in place

> + memset(cmpl_dump, 0, sizeof(*cmpl_dump));
> +
> + memcpy(&cmpl_dump->info, cmpl, sizeof(cmpl_dump->info));
> + cmpl_dump->valid = 1;
> +
> + txo->tx_cmpl_idx = ((txo->tx_cmpl_idx + 1) % BE_TXQ_INFO_LEN);

outer parens unnecesary

> +}
> +
> +void be_record_tx_wqes(struct be_tx_obj *txo,
> + struct be_wrb_params *wrb_params,
> + struct sk_buff *skb)
> +{
> + u32 offset = txo->tx_wqe_offset;
> + struct be_tx_pktinfo *pkt_info = &txo->tx_pktinfo[offset];
> +
> + memset(pkt_info, 0, sizeof(*pkt_info));
> +
> + pkt_info->tail = txo->q.tail;
> + pkt_info->head = txo->q.head;
> + pkt_info->used = atomic_read(&txo->q.used);
> + pkt_info->valid = 1;

> @@ -1417,6 +1458,75 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev)
> return NETDEV_TX_OK;
> }
>
> +void
> +be_print_tx_wqes(struct be_adapter *adapter, struct be_tx_obj *txo)
> +{
> + struct device *dev = &adapter->pdev->dev;
> + struct be_tx_pktinfo *pkt_info;
> + u8 *data;
> + int i, j;
> +
> + dev_info(dev, "Dumping WQES of TXQ id %d\n", txo->q.id);
> +
> + for (i = 0; i < BE_TXQ_INFO_LEN; i++) {
> + pkt_info = &txo->tx_pktinfo[i];
> + if (!pkt_info->valid)
> + continue;
> +
> + dev_info(dev, "TXQ head %d tail %d used %d\n",
> + pkt_info->head, pkt_info->tail, pkt_info->used);
> +
> + dev_info(dev, "WRB params: feature:0x%x vlan_tag:0x%x lso_mss:0x%x\n",
> + pkt_info->wqe_hdr.features, pkt_info->wqe_hdr.vlan_tag,
> + pkt_info->wqe_hdr.lso_mss);
> +
> + dev_info(dev, "SKB len: %d\n", pkt_info->skb_len);
> + data = pkt_info->skb_data;
> + for (j = 0 ; j < pkt_info->len; j++) {
> + printk("%02x ", data[j]);

Please use something like print_hex_dump().

> + if (j % 8 == 7)
> + printk(KERN_INFO "\n");
> + }
> + }
> +}
> +
> +void
> +be_print_tx_cmpls(struct be_adapter *adapter, struct be_tx_obj *txo)
> +{
> + struct device *dev = &adapter->pdev->dev;
> + struct be_tx_dump_cmpl *cmpl_info;
> + int i;
> +
> + dev_info(dev, "TX CQ id %d head %d tail %d used %d\n",
> + txo->cq.id, txo->cq.head, txo->cq.tail,
> + atomic_read(&txo->cq.used));
> +
> + for (i = 0; i < BE_TXQ_INFO_LEN; i++) {
> + cmpl_info = &txo->cmpl_info[i];
> + if (!cmpl_info->valid)
> + continue;
> +
> + printk(KERN_INFO "0x%x 0x%x 0x%x 0x%x\n",
> + cmpl_info->info[0], cmpl_info->info[1],
> + cmpl_info->info[2], cmpl_info->info[3]);

Some functions use printk() some dev_info(), can is there a well
thought out reason for this?

> + }
> +}
> +
> +/* be_dump_info - Print tx-wqes, tx-cmpls and skb-data */
> +void be_dump_info(struct be_adapter *adapter)

Most if not all functions you add need to be static. Please make sure
your code builds cleanly with W=1 C=1.

> +{
> + struct be_tx_obj *txo;
> + int i;
> +
> + if (!adapter->msg_enable)
> + return;

nit: you're a little inconsistent with whether the caller or the callee
checks the msg_enable.

> + for_all_tx_queues(adapter, txo, i) {
> + be_print_tx_wqes(adapter, txo);
> + be_print_tx_cmpls(adapter, txo);
> + }
> +}
> +
> static void be_tx_timeout(struct net_device *netdev)
> {
> struct be_adapter *adapter = netdev_priv(netdev);
> @@ -1429,6 +1539,8 @@ static void be_tx_timeout(struct net_device *netdev)
> int status;
> int i, j;
>
> + be_dump_info(adapter);
> +
> for_all_tx_queues(adapter, txo, i) {
> dev_info(dev, "TXQ Dump: %d H: %d T: %d used: %d, qid: 0x%x\n",
> i, txo->q.head, txo->q.tail,
> @@ -2719,6 +2831,10 @@ static struct be_tx_compl_info *be_tx_compl_get(struct be_adapter *adapter,
> rmb();
> be_dws_le_to_cpu(compl, sizeof(*compl));
>
> + /* Dump completion info */
> + if (adapter->msg_enable)
> + be_record_tx_cmpl(txo, compl);
> +
> txcp->status = GET_TX_COMPL_BITS(status, compl);
> txcp->end_index = GET_TX_COMPL_BITS(wrb_index, compl);
>