Re: [PATCH 1/1] NFS: pnfs: cache data server entries for a short grace period

From: Trond Myklebust

Date: Thu Apr 16 2026 - 13:46:35 EST


On Thu, 2026-04-16 at 07:18 +0000, Lei Lei2 Yin wrote:
> From c0ce7c997fdd383964b729c3398770a9aaaaa178 Mon Sep 17 00:00:00
> 2001
> From: Lei Yin <yinlei2@xxxxxxxxxx>
> Date: Thu, 16 Apr 2026 06:42:50 +0000
> Subject: [PATCH 1/1] NFS: pnfs: cache data server entries for a short
> grace period
>
> pNFS data server cache entries are currently destroyed as soon as the
> last reference is dropped. Workloads that repeatedly open and close
> the
> same files can end up tearing down and rebuilding the same data
> server
> cache entries over and over, even when the reuse interval is very
> short.
>
> Keep a data server cache entry alive for a short grace period after
> the
> last put by deferring destruction to delayed work. If the same data
> server is looked up again before the timeout expires, cancel the
> pending
> destroy and reuse the existing cache entry.
>
> This avoids repeated cache churn in open/close-heavy test workloads
> and
> helps reduce the extra read/write latency that comes from repeatedly
> recreating data server state on short reuse intervals.
>
> Because destruction is now deferred, reap scheduled destroy work
> during
> netns teardown before checking that the data server cache is empty.
> This
> prevents pending destroy work from outliving the netns exit path.
>
> Signed-off-by: Lei Yin <yinlei2@xxxxxxxxxx>
> ---
>  fs/nfs/client.c   |  1 +
>  fs/nfs/pnfs.h     |  3 +++
>  fs/nfs/pnfs_nfs.c | 57 +++++++++++++++++++++++++++++++++++++++++++--
> --
>  3 files changed, 57 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfs/client.c b/fs/nfs/client.c
> index be02bb227741..65ab219ea206 100644
> --- a/fs/nfs/client.c
> +++ b/fs/nfs/client.c
> @@ -1287,6 +1287,7 @@ void nfs_clients_exit(struct net *net)
>         WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
>         WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
>  #if IS_ENABLED(CONFIG_NFS_V4)
> +       nfs4_pnfs_ds_client_exit(net);
>         WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache));
>  #endif /* CONFIG_NFS_V4 */
>  }
> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
> index eb39859c216c..8c198d577465 100644
> --- a/fs/nfs/pnfs.h
> +++ b/fs/nfs/pnfs.h
> @@ -64,7 +64,9 @@ struct nfs4_pnfs_ds {
>         struct nfs_client       *ds_clp;
>         refcount_t              ds_count;
>         unsigned long           ds_state;
> +       struct delayed_work     ds_destroy_work;
>  #define NFS4DS_CONNECTING      0       /* ds is establishing
> connection */
> +#define NFS4DS_DESTROY_SCHED   1       /* delayed destroy has been
> scheduled */
>  };
>  
>  struct pnfs_layout_segment {
> @@ -415,6 +417,7 @@ int pnfs_generic_commit_pagelist(struct inode
> *inode,
>  int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
> int max);
>  void pnfs_generic_write_commit_done(struct rpc_task *task, void
> *data);
>  void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
> +void nfs4_pnfs_ds_client_exit(struct net *net);
>  struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net,
>                                       struct list_head *dsaddrs,
>                                       gfp_t gfp_flags);
> diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
> index 12632a706da8..79a96e5264b8 100644
> --- a/fs/nfs/pnfs_nfs.c
> +++ b/fs/nfs/pnfs_nfs.c
> @@ -653,18 +653,65 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds)
>         kfree(ds);
>  }
>  
> +static void nfs4_pnfs_ds_destroy_workfn(struct work_struct *work)
> +{
> +       struct nfs4_pnfs_ds *ds = container_of(to_delayed_work(work),
> +                                              struct nfs4_pnfs_ds,
> +                                              ds_destroy_work);
> +       struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id);
> +       bool destroy = false;
> +
> +       spin_lock(&nn->nfs4_data_server_lock);
> +       if (test_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state) &&
> +           refcount_read(&ds->ds_count) == 1) {
> +               clear_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state);
> +               list_del_init(&ds->ds_node);
> +               destroy = true;
> +       }
> +       spin_unlock(&nn->nfs4_data_server_lock);
> +
> +       if (destroy)
> +               destroy_ds(ds);
> +}
> +
>  void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
>  {
>         struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id);
>  
>         if (refcount_dec_and_lock(&ds->ds_count, &nn-
> >nfs4_data_server_lock)) {
> -               list_del_init(&ds->ds_node);
> +               /* Hold one temporary reference during the destroy
> grace window. */
> +               refcount_set(&ds->ds_count, 1);
> +               set_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state);
> +               mod_delayed_work(system_wq, &ds->ds_destroy_work, 10
> * HZ);
>                 spin_unlock(&nn->nfs4_data_server_lock);
> -               destroy_ds(ds);
>         }
>  }
>  EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
>  
> +void nfs4_pnfs_ds_client_exit(struct net *net)
> +{
> +       struct nfs_net *nn = net_generic(net, nfs_net_id);
> +       struct nfs4_pnfs_ds *ds, *tmp;
> +       LIST_HEAD(reap);
> +
> +       spin_lock(&nn->nfs4_data_server_lock);
> +       list_for_each_entry_safe(ds, tmp, &nn-
> >nfs4_data_server_cache, ds_node) {
> +               if (!test_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state) ||
> +                   refcount_read(&ds->ds_count) != 1)
> +                       continue;
> +               clear_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state);
> +               list_move(&ds->ds_node, &reap);
> +       }
> +       spin_unlock(&nn->nfs4_data_server_lock);
> +
> +       list_for_each_entry_safe(ds, tmp, &reap, ds_node) {
> +               cancel_delayed_work_sync(&ds->ds_destroy_work);
> +               list_del_init(&ds->ds_node);
> +               destroy_ds(ds);
> +       }
> +}
> +EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_client_exit);
> +
>  /*
>   * Create a string with a human readable address and port to avoid
>   * complicated setup around many dprinks.
> @@ -745,6 +792,7 @@ nfs4_pnfs_ds_add(const struct net *net, struct
> list_head *dsaddrs, gfp_t gfp_fla
>                 ds->ds_remotestr = remotestr;
>                 refcount_set(&ds->ds_count, 1);
>                 INIT_LIST_HEAD(&ds->ds_node);
> +               INIT_DELAYED_WORK(&ds->ds_destroy_work,
> nfs4_pnfs_ds_destroy_workfn);
>                 ds->ds_net = net;
>                 ds->ds_clp = NULL;
>                 list_add(&ds->ds_node, &nn->nfs4_data_server_cache);
> @@ -753,8 +801,9 @@ nfs4_pnfs_ds_add(const struct net *net, struct
> list_head *dsaddrs, gfp_t gfp_fla
>         } else {
>                 kfree(remotestr);
>                 kfree(ds);
> -               refcount_inc(&tmp_ds->ds_count);
> -               dprintk("%s data server %s found, inc'ed ds_count to
> %d\n",
> +               if (!test_and_clear_bit(NFS4DS_DESTROY_SCHED,
> &tmp_ds->ds_state))
> +                       refcount_inc(&tmp_ds->ds_count);
> +               dprintk("%s data server %s found, ds_count now %d\n",
>                         __func__, tmp_ds->ds_remotestr,
>                         refcount_read(&tmp_ds->ds_count));
>                 ds = tmp_ds;

NACK. This approach is the same as the one presented a few months ago
on this list. As stated at the time, it does not agree with the
description of deviceid behaviour in RFC8881 section 12.2.10.

If you want to cache entries on the client when not holding a layout,
then you need to request a deviceinfo notification so that you can
detect changes.

--
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trondmy@xxxxxxxxxx, trond.myklebust@xxxxxxxxxxxxxxx