Re: [PATCH, net-next] net: mana: Trigger VF reset/recovery on health check failure due to HWC timeout

From: Dipayaan Roy

Date: Fri Feb 27 2026 - 03:10:44 EST


On Thu, Feb 26, 2026 at 07:48:31PM +0000, Long Li wrote:
> > The GF stats periodic query is used as a mechanism to monitor HWC health.
> > If this HWC command times out, it is a strong indication that the device/SoC is in a
> > faulty state and requires recovery.
> >
> > Today, when a timeout is detected, the driver marks hwc_timeout_occurred,
> > clears cached stats, and stops rescheduling the periodic work. However, the
> > device itself is left in the same failing state.
> >
> > Extend the timeout handling path to trigger the existing MANA VF recovery
> > service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item.
> > This is expected to initiate the appropriate recovery flow by attempting a
> > suspend/resume first and, if that fails, triggering a bus rescan.
> >
> > This change is intentionally limited to HWC command timeouts and does not
> > trigger recovery for errors reported by the SoC as a normal command response.
> >
> > Signed-off-by: Dipayaan Roy <dipayanroy@xxxxxxxxxxxxxxxxxxx>
> > ---
> > .../net/ethernet/microsoft/mana/gdma_main.c | 14 +++-------
> > drivers/net/ethernet/microsoft/mana/mana_en.c | 28 ++++++++++++++++++-
> > include/net/mana/gdma.h | 16 +++++++++--
> > 3 files changed, 45 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 0055c231acf6..16c438d2aaa3 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -490,15 +490,9 @@ static void mana_serv_reset(struct pci_dev *pdev)
> > dev_info(&pdev->dev, "MANA reset cycle completed\n");
> >
> > out:
> > - gc->in_service = false;
> > + clear_bit(GC_IN_SERVICE, &gc->flags);
> > }
> >
> > -struct mana_serv_work {
> > - struct work_struct serv_work;
> > - struct pci_dev *pdev;
> > - enum gdma_eqe_type type;
> > -};
> > -
> > static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev)
> > {
> > switch (type) {
> > @@ -542,7 +536,7 @@ static void mana_recovery_delayed_func(struct
> > work_struct *w)
> > spin_unlock_irqrestore(&work->lock, flags); }
> >
> > -static void mana_serv_func(struct work_struct *w)
> > +void mana_serv_func(struct work_struct *w)
> > {
> > struct mana_serv_work *mns_wk;
> > struct pci_dev *pdev;
> > @@ -624,7 +618,7 @@ static void mana_gd_process_eqe(struct gdma_queue
> > *eq)
> > break;
> > }
> >
> > - if (gc->in_service) {
> > + if (test_bit(GC_IN_SERVICE, &gc->flags)) {
> > dev_info(gc->dev, "Already in service\n");
> > break;
> > }
> > @@ -641,7 +635,7 @@ static void mana_gd_process_eqe(struct gdma_queue
> > *eq)
> > }
> >
> > dev_info(gc->dev, "Start MANA service type:%d\n", type);
> > - gc->in_service = true;
> > + set_bit(GC_IN_SERVICE, &gc->flags);
> > mns_wk->pdev = to_pci_dev(gc->dev);
> > mns_wk->type = type;
> > pci_dev_get(mns_wk->pdev);
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index 91c418097284..8da574cf06f2 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -879,7 +879,7 @@ static void mana_tx_timeout(struct net_device *netdev,
> > unsigned int txqueue)
> > struct gdma_context *gc = ac->gdma_dev->gdma_context;
> >
> > /* Already in service, hence tx queue reset is not required.*/
> > - if (gc->in_service)
> > + if (test_bit(GC_IN_SERVICE, &gc->flags))
> > return;
> >
> > /* Note: If there are pending queue reset work for this port(apc), @@ -
> > 3533,6 +3533,8 @@ static void mana_gf_stats_work_handler(struct work_struct
> > *work) {
> > struct mana_context *ac =
> > container_of(to_delayed_work(work), struct mana_context,
> > gf_stats_work);
> > + struct gdma_context *gc = ac->gdma_dev->gdma_context;
> > + struct mana_serv_work *mns_wk;
> > int err;
> >
> > err = mana_query_gf_stats(ac);
> > @@ -3540,6 +3542,30 @@ static void mana_gf_stats_work_handler(struct
> > work_struct *work)
> > /* HWC timeout detected - reset stats and stop rescheduling */
> > ac->hwc_timeout_occurred = true;
> > memset(&ac->hc_stats, 0, sizeof(ac->hc_stats));
> > + dev_warn(gc->dev,
> > + "Gf stats wk handler: gf stats query timed out.\n");
> > +
> > + /* As HWC timed out, indicating a faulty HW state and needs a
> > + * reset.
> > + */
> > + if (!test_and_set_bit(GC_IN_SERVICE, &gc->flags)) {
> > + if (!try_module_get(THIS_MODULE)) {
> > + dev_info(gc->dev, "Module is unloading\n");
> > + return;
> > + }
> > +
> > + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
> > + if (!mns_wk) {
> > + module_put(THIS_MODULE);
>
> Maybe it's not necessary: check if you want to call clear_bit(GC_IN_SERVICE, &gc->flags) here?
>
Yes, it makes sense to clear it here.
> > + return;
> > + }
> > +
> > + mns_wk->pdev = to_pci_dev(gc->dev);
> > + mns_wk->type = GDMA_EQE_HWC_RESET_REQUEST;
> > + pci_dev_get(mns_wk->pdev);
> > + INIT_WORK(&mns_wk->serv_work, mana_serv_func);
> > + schedule_work(&mns_wk->serv_work);
> > + }
> > return;
> > }
> > schedule_delayed_work(&ac->gf_stats_work,
> > MANA_GF_STATS_PERIOD); diff --git a/include/net/mana/gdma.h
>

Regards