[PATCH 1/1] NFS: pnfs: cache data server entries for a short grace period

From: Lei Yin

Date: Thu Apr 16 2026 - 02:42:50 EST


pNFS data server cache entries are currently destroyed as soon as the
last reference is dropped. Workloads that repeatedly open and close the
same files can end up tearing down and rebuilding the same data server
cache entries over and over, even when the reuse interval is very short.

Keep a data server cache entry alive for a short grace period (10
seconds) after the last put by deferring destruction to delayed work. If
the same data server is looked up again before the timeout expires,
cancel the pending destroy and reuse the existing cache entry.

This avoids repeated cache churn in open/close-heavy test workloads and
helps reduce the extra read/write latency that comes from repeatedly
recreating data server state on short reuse intervals.

Because destruction is now deferred, reap scheduled destroy work during
netns teardown before checking that the data server cache is empty. This
prevents pending destroy work from outliving the netns exit path.

Signed-off-by: Lei Yin <yinlei2@xxxxxxxxxx>
---
fs/nfs/client.c | 1 +
fs/nfs/pnfs.h | 3 +++
fs/nfs/pnfs_nfs.c | 57 +++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index be02bb227741..65ab219ea206 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1287,6 +1287,7 @@ void nfs_clients_exit(struct net *net)
WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
#if IS_ENABLED(CONFIG_NFS_V4)
+ nfs4_pnfs_ds_client_exit(net);
WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache));
#endif /* CONFIG_NFS_V4 */
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index eb39859c216c..8c198d577465 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -64,7 +64,9 @@ struct nfs4_pnfs_ds {
struct nfs_client *ds_clp;
refcount_t ds_count;
unsigned long ds_state;
+ struct delayed_work ds_destroy_work;
#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
+#define NFS4DS_DESTROY_SCHED 1 /* delayed destroy has been schedul=
ed */
};
=20
struct pnfs_layout_segment {
@@ -415,6 +417,7 @@ int pnfs_generic_commit_pagelist(struct inode *inode,
int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max)=
;
void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
+void nfs4_pnfs_ds_client_exit(struct net *net);
struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net,
struct list_head *dsaddrs,
gfp_t gfp_flags);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 12632a706da8..79a96e5264b8 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -653,18 +653,65 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds)
kfree(ds);
}
=20
+static void nfs4_pnfs_ds_destroy_workfn(struct work_struct *work)
+{
+ struct nfs4_pnfs_ds *ds =3D container_of(to_delayed_work(work),
+ struct nfs4_pnfs_ds,
+ ds_destroy_work);
+ struct nfs_net *nn =3D net_generic(ds->ds_net, nfs_net_id);
+ bool destroy =3D false;
+
+ spin_lock(&nn->nfs4_data_server_lock);
+ if (test_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state) &&
+ refcount_read(&ds->ds_count) =3D=3D 1) {
+ clear_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state);
+ list_del_init(&ds->ds_node);
+ destroy =3D true;
+ }
+ spin_unlock(&nn->nfs4_data_server_lock);
+
+ if (destroy)
+ destroy_ds(ds);
+}
+
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
{
struct nfs_net *nn =3D net_generic(ds->ds_net, nfs_net_id);
=20
if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock=
)) {
- list_del_init(&ds->ds_node);
+ /* Hold one temporary reference during the destroy grace wi=
ndow. */
+ refcount_set(&ds->ds_count, 1);
+ set_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state);
+ mod_delayed_work(system_wq, &ds->ds_destroy_work, 10 * HZ);
spin_unlock(&nn->nfs4_data_server_lock);
- destroy_ds(ds);
}
}
EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
=20
+void nfs4_pnfs_ds_client_exit(struct net *net)
+{
+ struct nfs_net *nn =3D net_generic(net, nfs_net_id);
+ struct nfs4_pnfs_ds *ds, *tmp;
+ LIST_HEAD(reap);
+
+ spin_lock(&nn->nfs4_data_server_lock);
+ list_for_each_entry_safe(ds, tmp, &nn->nfs4_data_server_cache, ds_n=
ode) {
+ if (!test_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state) ||
+ refcount_read(&ds->ds_count) !=3D 1)
+ continue;
+ clear_bit(NFS4DS_DESTROY_SCHED, &ds->ds_state);
+ list_move(&ds->ds_node, &reap);
+ }
+ spin_unlock(&nn->nfs4_data_server_lock);
+
+ list_for_each_entry_safe(ds, tmp, &reap, ds_node) {
+ cancel_delayed_work_sync(&ds->ds_destroy_work);
+ list_del_init(&ds->ds_node);
+ destroy_ds(ds);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_client_exit);
+
/*
* Create a string with a human readable address and port to avoid
* complicated setup around many dprinks.
@@ -745,6 +792,7 @@ nfs4_pnfs_ds_add(const struct net *net, struct list_hea=
d *dsaddrs, gfp_t gfp_fla
ds->ds_remotestr =3D remotestr;
refcount_set(&ds->ds_count, 1);
INIT_LIST_HEAD(&ds->ds_node);
+ INIT_DELAYED_WORK(&ds->ds_destroy_work, nfs4_pnfs_ds_destro=
y_workfn);
ds->ds_net =3D net;
ds->ds_clp =3D NULL;
list_add(&ds->ds_node, &nn->nfs4_data_server_cache);
@@ -753,8 +801,9 @@ nfs4_pnfs_ds_add(const struct net *net, struct list_hea=
d *dsaddrs, gfp_t gfp_fla
} else {
kfree(remotestr);
kfree(ds);
- refcount_inc(&tmp_ds->ds_count);
- dprintk("%s data server %s found, inc'ed ds_count to %d\n",
+ if (!test_and_clear_bit(NFS4DS_DESTROY_SCHED, &tmp_ds->ds_s=
tate))
+ refcount_inc(&tmp_ds->ds_count);
+ dprintk("%s data server %s found, ds_count now %d\n",
__func__, tmp_ds->ds_remotestr,
refcount_read(&tmp_ds->ds_count));
ds =3D tmp_ds;
--=20
2.43.0