RE: [PATCH, net-next] net: mana: Trigger VF reset/recovery on health check failure due to HWC timeout

From: Long Li

Date: Thu Feb 26 2026 - 14:48:51 EST


> The GF stats periodic query is used as a mechanism to monitor HWC health.
> If this HWC command times out, it is a strong indication that the device/SoC is in a
> faulty state and requires recovery.
>
> Today, when a timeout is detected, the driver marks hwc_timeout_occurred,
> clears cached stats, and stops rescheduling the periodic work. However, the
> device itself is left in the same failing state.
>
> Extend the timeout handling path to trigger the existing MANA VF recovery
> service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item.
> This is expected to initiate the appropriate recovery flow by attempting a
> suspend/resume first and, if that fails, triggering a bus rescan.
>
> This change is intentionally limited to HWC command timeouts and does not
> trigger recovery for errors reported by the SoC as a normal command response.
>
> Signed-off-by: Dipayaan Roy <dipayanroy@xxxxxxxxxxxxxxxxxxx>
> ---
> .../net/ethernet/microsoft/mana/gdma_main.c | 14 +++-------
> drivers/net/ethernet/microsoft/mana/mana_en.c | 28 ++++++++++++++++++-
> include/net/mana/gdma.h | 16 +++++++++--
> 3 files changed, 45 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 0055c231acf6..16c438d2aaa3 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -490,15 +490,9 @@ static void mana_serv_reset(struct pci_dev *pdev)
> dev_info(&pdev->dev, "MANA reset cycle completed\n");
>
> out:
> - gc->in_service = false;
> + clear_bit(GC_IN_SERVICE, &gc->flags);
> }
>
> -struct mana_serv_work {
> - struct work_struct serv_work;
> - struct pci_dev *pdev;
> - enum gdma_eqe_type type;
> -};
> -
> static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev)
> {
> switch (type) {
> @@ -542,7 +536,7 @@ static void mana_recovery_delayed_func(struct
> work_struct *w)
> spin_unlock_irqrestore(&work->lock, flags); }
>
> -static void mana_serv_func(struct work_struct *w)
> +void mana_serv_func(struct work_struct *w)
> {
> struct mana_serv_work *mns_wk;
> struct pci_dev *pdev;
> @@ -624,7 +618,7 @@ static void mana_gd_process_eqe(struct gdma_queue
> *eq)
> break;
> }
>
> - if (gc->in_service) {
> + if (test_bit(GC_IN_SERVICE, &gc->flags)) {
> dev_info(gc->dev, "Already in service\n");
> break;
> }
> @@ -641,7 +635,7 @@ static void mana_gd_process_eqe(struct gdma_queue
> *eq)
> }
>
> dev_info(gc->dev, "Start MANA service type:%d\n", type);
> - gc->in_service = true;
> + set_bit(GC_IN_SERVICE, &gc->flags);
> mns_wk->pdev = to_pci_dev(gc->dev);
> mns_wk->type = type;
> pci_dev_get(mns_wk->pdev);
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 91c418097284..8da574cf06f2 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -879,7 +879,7 @@ static void mana_tx_timeout(struct net_device *netdev,
> unsigned int txqueue)
> struct gdma_context *gc = ac->gdma_dev->gdma_context;
>
> /* Already in service, hence tx queue reset is not required.*/
> - if (gc->in_service)
> + if (test_bit(GC_IN_SERVICE, &gc->flags))
> return;
>
> /* Note: If there are pending queue reset work for this port(apc), @@ -
> 3533,6 +3533,8 @@ static void mana_gf_stats_work_handler(struct work_struct
> *work) {
> struct mana_context *ac =
> container_of(to_delayed_work(work), struct mana_context,
> gf_stats_work);
> + struct gdma_context *gc = ac->gdma_dev->gdma_context;
> + struct mana_serv_work *mns_wk;
> int err;
>
> err = mana_query_gf_stats(ac);
> @@ -3540,6 +3542,30 @@ static void mana_gf_stats_work_handler(struct
> work_struct *work)
> /* HWC timeout detected - reset stats and stop rescheduling */
> ac->hwc_timeout_occurred = true;
> memset(&ac->hc_stats, 0, sizeof(ac->hc_stats));
> + dev_warn(gc->dev,
> + "Gf stats wk handler: gf stats query timed out.\n");
> +
> + /* As HWC timed out, indicating a faulty HW state and needs a
> + * reset.
> + */
> + if (!test_and_set_bit(GC_IN_SERVICE, &gc->flags)) {
> + if (!try_module_get(THIS_MODULE)) {
> + dev_info(gc->dev, "Module is unloading\n");
> + return;
> + }
> +
> + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
> + if (!mns_wk) {
> + module_put(THIS_MODULE);

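Maybe it's not necessary, but please check whether you want to call clear_bit(GC_IN_SERVICE, &gc->flags) here: test_and_set_bit() above has already set the flag, and returning on this allocation failure leaves GC_IN_SERVICE set with no service work queued.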

> + return;
> + }
> +
> + mns_wk->pdev = to_pci_dev(gc->dev);
> + mns_wk->type = GDMA_EQE_HWC_RESET_REQUEST;
> + pci_dev_get(mns_wk->pdev);
> + INIT_WORK(&mns_wk->serv_work, mana_serv_func);
> + schedule_work(&mns_wk->serv_work);
> + }
> return;
> }
> schedule_delayed_work(&ac->gf_stats_work,
> MANA_GF_STATS_PERIOD); diff --git a/include/net/mana/gdma.h