Re: [PATCH] pnfs/filelayout: handle data server op_status errors

From: Trond Myklebust
Date: Tue Dec 09 2025 - 10:35:12 EST


On Tue, 2025-12-09 at 13:56 +0000, Robert Milkowski wrote:
> Data servers can return NFS-level status in op_status, but the file
> layout
> driver only looked at RPC transport errors. That means session
> errors,
> layout-invalidating statuses, and retry hints from the DS can be
> ignored,
> leading to missed session recovery, stale layouts, or failed retries.

The task->tk_status can carry NFS level status if there was no RPC
error. That's why we distinguish between task->tk_status and task-
>tk_rpc_status (the latter being guaranteed to only carry RPC errors).

IOW: is there any evidence of what you call out above?

>
> Pass the DS op_status into the async error handler and handle the
> same set
> of NFS status codes as flexfiles (see commit 38074de35b01,
> "NFSv4/flexfiles: Fix handling of NFS level errors in I/O"). Wire the
> read/write/commit callbacks to propagate op_status so the file layout
> driver
> can invalidate layouts, trigger session recovery, or retry as
> appropriate.
>
> Signed-off-by: Robert Milkowski <rmilkowski@xxxxxxxxx>
> ---
>  fs/nfs/filelayout/filelayout.c | 54
> ++++++++++++++++++++++++++++++++--
>  1 file changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/fs/nfs/filelayout/filelayout.c
> b/fs/nfs/filelayout/filelayout.c
> index 5c4551117c58..2808baa19f83 100644
> --- a/fs/nfs/filelayout/filelayout.c
> +++ b/fs/nfs/filelayout/filelayout.c
> @@ -121,6 +121,7 @@ static void filelayout_reset_read(struct
> nfs_pgio_header *hdr)
>  }
>  
>  static int filelayout_async_handle_error(struct rpc_task *task,
> + u32 op_status,
>   struct nfs4_state *state,
>   struct nfs_client *clp,
>   struct pnfs_layout_segment
> *lseg)
> @@ -130,6 +131,48 @@ static int filelayout_async_handle_error(struct
> rpc_task *task,
>   struct nfs4_deviceid_node *devid =
> FILELAYOUT_DEVID_NODE(lseg);
>   struct nfs4_slot_table *tbl = &clp->cl_session-
> >fc_slot_table;
>  
> + if (op_status) {
> + switch (op_status) {
> + case NFS4_OK:
> + case NFS4ERR_NXIO:
> + break;
> + case NFS4ERR_BADSESSION:
> + case NFS4ERR_BADSLOT:
> + case NFS4ERR_BAD_HIGH_SLOT:
> + case NFS4ERR_DEADSESSION:
> + case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
> + case NFS4ERR_SEQ_FALSE_RETRY:
> + case NFS4ERR_SEQ_MISORDERED:
> + dprintk("%s op_status %u, Reset session.
> Exchangeid "
> + "flags 0x%x\n", __func__, op_status,
> + clp->cl_exchange_flags);
> + nfs4_schedule_session_recovery(clp-
> >cl_session,
> +       op_status);
> + goto out_retry;
> + case NFS4ERR_DELAY:
> + case NFS4ERR_GRACE:
> + case NFS4ERR_RETRY_UNCACHED_REP:
> + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
> + goto out_retry;
> + case NFS4ERR_ACCESS:
> + case NFS4ERR_PNFS_NO_LAYOUT:
> + case NFS4ERR_STALE:
> + case NFS4ERR_BADHANDLE:
> + case NFS4ERR_ISDIR:
> + case NFS4ERR_FHEXPIRED:
> + case NFS4ERR_WRONG_TYPE:
> + case NFS4ERR_NOMATCHING_LAYOUT:
> + case NFSERR_PERM:
> + dprintk("%s Invalid layout op_status %u\n",
> __func__,
> + op_status);
> + pnfs_destroy_layout(NFS_I(inode));
> + rpc_wake_up(&tbl->slot_tbl_waitq);
> + goto reset;
> + default:
> + goto reset;
> + }
> + }
> +
>   if (task->tk_status >= 0)
>   return 0;
>  
> @@ -196,6 +239,8 @@ static int filelayout_async_handle_error(struct
> rpc_task *task,
>   task->tk_status);
>   return -NFS4ERR_RESET_TO_MDS;
>   }
> +
> +out_retry:
>   task->tk_status = 0;
>   return -EAGAIN;
>  }
> @@ -208,7 +253,8 @@ static int filelayout_read_done_cb(struct
> rpc_task *task,
>   int err;
>  
>   trace_nfs4_pnfs_read(hdr, task->tk_status);
> - err = filelayout_async_handle_error(task, hdr->args.context-
> >state,
> + err = filelayout_async_handle_error(task, hdr-
> >res.op_status,
> +     hdr->args.context-
> >state,
>       hdr->ds_clp, hdr->lseg);
>  
>   switch (err) {
> @@ -318,7 +364,8 @@ static int filelayout_write_done_cb(struct
> rpc_task *task,
>   int err;
>  
>   trace_nfs4_pnfs_write(hdr, task->tk_status);
> - err = filelayout_async_handle_error(task, hdr->args.context-
> >state,
> + err = filelayout_async_handle_error(task, hdr-
> >res.op_status,
> +     hdr->args.context-
> >state,
>       hdr->ds_clp, hdr->lseg);
>  
>   switch (err) {
> @@ -346,7 +393,8 @@ static int filelayout_commit_done_cb(struct
> rpc_task *task,
>   int err;
>  
>   trace_nfs4_pnfs_commit_ds(data, task->tk_status);
> - err = filelayout_async_handle_error(task, NULL, data-
> >ds_clp,
> + err = filelayout_async_handle_error(task, data-
> >res.op_status,
> +     NULL, data->ds_clp,
>       data->lseg);
>  
>   switch (err) {
>
> base-commit: cb015814f8b6eebcbb8e46e111d108892c5e6821

I'd be willing to take something like the above as a cleanup, assuming
that it replaces the existing handling of NFSv4.1 errors using task-
>tk_status instead of just adding new cases.

However I'd want to see evidence that the resulting patch has been
thoroughly tested before submission, because I currently have no way to
test it myself.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@xxxxxxxxxx, trond.myklebust@xxxxxxxxxxxxxxx