[PATCH v2 4/9] nfsd: dedup nfs4_client_to_reclaim inserts

From: Jeff Layton

Date: Sat May 30 2026 - 09:20:41 EST


nfs4_client_to_reclaim() unconditionally allocates a new
nfs4_client_reclaim, prepends it to reclaim_str_hashtbl[], and bumps
reclaim_str_hashtbl_size with no check for an existing entry for the
same client name. After a reboot with a populated recovery directory,
this inflates the counter by one for every client that reclaims:

boot: load_recdir()
nfs4_client_to_reclaim(name) /* entry #1, size++ */

grace: RECLAIM_COMPLETE
__nfsd4_create_reclaim_record_grace()
nfs4_client_to_reclaim(name) /* entry #2, size++ */

inc_reclaim_complete() ends the grace period early only when

atomic_inc_return(&nn->nr_reclaim_complete) ==
nn->reclaim_str_hashtbl_size

With N reclaiming clients, reclaim_str_hashtbl_size ends up at 2N while
nr_reclaim_complete is capped at N, so the equality never holds and the
fast end-of-grace path is dead. The grace period always runs for the
full 90-second laundromat timer, and the shadow entry left in the hash
table carries a dangling cr_clp for any reader that walks it.

Fix nfs4_client_to_reclaim() to look the name up with
nfsd4_find_reclaim_client() first and, on a hit, fold the new
princhash into the existing record (if it lacks one) and return that
record without allocating or touching reclaim_str_hashtbl_size. On
kmemdup() failure during the fold-in, return NULL so
__cld_pipe_inprogress_downcall() surfaces -EFAULT to nfsdcld, matching
the miss-path contract.

Add an rw_semaphore (reclaim_str_hashtbl_lock) to struct nfsd_net that
serialises all access to reclaim_str_hashtbl[] and
reclaim_str_hashtbl_size. Writers (nfs4_client_to_reclaim,
nfs4_remove_reclaim_record callers) hold the write side; readers
(nfsd4_cld_check*, inc_reclaim_complete, clients_still_reclaiming,
nfs4_has_reclaimed_state, nfsd4_check_legacy_client) hold the read
side. All call sites are in sleepable context, and none is a hot
path, so the rwsem cost is negligible.

Reported-by: Chris Mason <clm@xxxxxxxx>
Fixes: 362063a595be ("nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld")
Assisted-by: kres:claude-opus-4-7
Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
---
fs/nfsd/netns.h | 6 ++++-
fs/nfsd/nfs4recover.c | 36 ++++++++++++++++++++++++------
fs/nfsd/nfs4state.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 89 insertions(+), 14 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 37dfecb9d49d..47bbd4fb42b0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -93,6 +93,7 @@ struct nfsd_net {
*/
struct list_head *reclaim_str_hashtbl;
int reclaim_str_hashtbl_size;
+ struct rw_semaphore reclaim_str_hashtbl_lock;
struct list_head *conf_id_hashtbl;
struct rb_root conf_name_tree;
struct list_head *unconf_id_hashtbl;
@@ -105,7 +106,10 @@ struct nfsd_net {
* close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
* for last close replay.
*
- * All of the above fields are protected by the client_mutex.
+ * reclaim_str_hashtbl[], reclaim_str_hashtbl_size are protected by
+ * reclaim_str_hashtbl_lock.
+ *
+ * All of the remaining fields are protected by the client_mutex.
*/
struct list_head client_lru;
struct list_head close_lru;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index c841da585142..d513971fb119 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -285,10 +285,12 @@ __nfsd4_remove_reclaim_record_grace(const char *dname, int len,
return;
}
name.len = len;
+ down_write(&nn->reclaim_str_hashtbl_lock);
crp = nfsd4_find_reclaim_client(name, nn);
- kfree(name.data);
if (crp)
nfs4_remove_reclaim_record(crp, nn);
+ up_write(&nn->reclaim_str_hashtbl_lock);
+ kfree(name.data);
}

static void
@@ -484,6 +486,7 @@ nfs4_legacy_state_init(struct net *net)
for (i = 0; i < CLIENT_HASH_SIZE; i++)
INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
nn->reclaim_str_hashtbl_size = 0;
+ init_rwsem(&nn->reclaim_str_hashtbl_lock);

return 0;
}
@@ -598,13 +601,16 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
goto out_enoent;
}
name.len = HEXDIR_LEN;
+ down_read(&nn->reclaim_str_hashtbl_lock);
crp = nfsd4_find_reclaim_client(name, nn);
- kfree(name.data);
if (crp) {
set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
crp->cr_clp = clp;
- return 0;
}
+ up_read(&nn->reclaim_str_hashtbl_lock);
+ kfree(name.data);
+ if (crp)
+ return 0;

out_enoent:
return -ENOENT;
@@ -1176,6 +1182,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
return 0;

/* look for it in the reclaim hashtable otherwise */
+ down_read(&nn->reclaim_str_hashtbl_lock);
crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
if (crp)
goto found;
@@ -1191,6 +1198,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
if (!name.data) {
dprintk("%s: failed to allocate memory for name.data!\n",
__func__);
+ up_read(&nn->reclaim_str_hashtbl_lock);
return -ENOENT;
}
name.len = HEXDIR_LEN;
@@ -1201,9 +1209,11 @@ nfsd4_cld_check(struct nfs4_client *clp)

}
#endif
+ up_read(&nn->reclaim_str_hashtbl_lock);
return -ENOENT;
found:
crp->cr_clp = clp;
+ up_read(&nn->reclaim_str_hashtbl_lock);
return 0;
}

@@ -1215,6 +1225,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
struct cld_net *cn = nn->cld_net;
#endif
struct nfs4_client_reclaim *crp;
+ unsigned int princhashlen;
char *principal = NULL;

/* did we already find that this client is stable? */
@@ -1222,6 +1233,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
return 0;

/* look for it in the reclaim hashtable otherwise */
+ down_read(&nn->reclaim_str_hashtbl_lock);
crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
if (crp)
goto found;
@@ -1237,6 +1249,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
if (!name.data) {
dprintk("%s: failed to allocate memory for name.data\n",
__func__);
+ up_read(&nn->reclaim_str_hashtbl_lock);
return -ENOENT;
}
name.len = HEXDIR_LEN;
@@ -1247,23 +1260,31 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)

}
#endif
+ up_read(&nn->reclaim_str_hashtbl_lock);
return -ENOENT;
found:
- if (crp->cr_princhash.len) {
+ princhashlen = crp->cr_princhash.len;
+ if (princhashlen) {
u8 digest[SHA256_DIGEST_SIZE];
+ u8 *pdata;

if (clp->cl_cred.cr_raw_principal)
principal = clp->cl_cred.cr_raw_principal;
else if (clp->cl_cred.cr_principal)
principal = clp->cl_cred.cr_principal;
- if (principal == NULL)
+ if (principal == NULL) {
+ up_read(&nn->reclaim_str_hashtbl_lock);
return -ENOENT;
+ }
sha256(principal, strlen(principal), digest);
- if (memcmp(crp->cr_princhash.data, digest,
- crp->cr_princhash.len))
+ pdata = crp->cr_princhash.data;
+ if (memcmp(pdata, digest, princhashlen)) {
+ up_read(&nn->reclaim_str_hashtbl_lock);
return -ENOENT;
+ }
}
crp->cr_clp = clp;
+ up_read(&nn->reclaim_str_hashtbl_lock);
return 0;
}

@@ -1362,6 +1383,7 @@ nfs4_cld_state_init(struct net *net)
for (i = 0; i < CLIENT_HASH_SIZE; i++)
INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
nn->reclaim_str_hashtbl_size = 0;
+ init_rwsem(&nn->reclaim_str_hashtbl_lock);
set_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags);
atomic_set(&nn->nr_reclaim_complete, 0);

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bc5216bb08ff..5bbc1d2b964a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2779,14 +2779,21 @@ static void inc_reclaim_complete(struct nfs4_client *clp)

if (!test_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags))
return;
- if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
+
+ down_read(&nn->reclaim_str_hashtbl_lock);
+ if (!nfsd4_find_reclaim_client(clp->cl_name, nn)) {
+ up_read(&nn->reclaim_str_hashtbl_lock);
return;
+ }
if (atomic_inc_return(&nn->nr_reclaim_complete) ==
nn->reclaim_str_hashtbl_size) {
+ up_read(&nn->reclaim_str_hashtbl_lock);
printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
clp->net->ns.inum);
nfsd4_end_grace(nn);
+ return;
}
+ up_read(&nn->reclaim_str_hashtbl_lock);
}

static void expire_client(struct nfs4_client *clp)
@@ -7097,10 +7104,15 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)

if (test_bit(NFSD_NET_GRACE_END_FORCED, &nn->flags))
return false;
- if (test_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags) &&
- atomic_read(&nn->nr_reclaim_complete) ==
- nn->reclaim_str_hashtbl_size)
- return false;
+ if (test_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags)) {
+ int size;
+
+ down_read(&nn->reclaim_str_hashtbl_lock);
+ size = nn->reclaim_str_hashtbl_size;
+ up_read(&nn->reclaim_str_hashtbl_lock);
+ if (atomic_read(&nn->nr_reclaim_complete) == size)
+ return false;
+ }
if (!test_bit(NFSD_NET_SOMEBODY_RECLAIMED, &nn->flags))
return false;
clear_bit(NFSD_NET_SOMEBODY_RECLAIMED, &nn->flags);
@@ -9270,9 +9282,13 @@ bool
nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn)
{
struct nfs4_client_reclaim *crp;
+ bool found;

+ down_read(&nn->reclaim_str_hashtbl_lock);
crp = nfsd4_find_reclaim_client(name, nn);
- return (crp && crp->cr_clp);
+ found = (crp && crp->cr_clp);
+ up_read(&nn->reclaim_str_hashtbl_lock);
+ return found;
}

/*
@@ -9285,10 +9301,39 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
unsigned int strhashval;
struct nfs4_client_reclaim *crp;

+ down_write(&nn->reclaim_str_hashtbl_lock);
+
+ /*
+ * A reclaim record for this client name may already exist (for
+ * example, populated at boot from the recovery directory before
+ * an in-grace RECLAIM_COMPLETE or an nfsdcld downcall delivers
+ * the same name). Dedup here so reclaim_str_hashtbl_size stays
+ * equal to the number of distinct client names; inc_reclaim_complete
+ * relies on that equality to end the grace period via the fast path.
+ */
+ crp = nfsd4_find_reclaim_client(name, nn);
+ if (crp) {
+ if (princhash.len && crp->cr_princhash.len == 0) {
+ void *pdata = kmemdup(princhash.data, princhash.len,
+ GFP_KERNEL);
+ if (pdata) {
+ crp->cr_princhash.data = pdata;
+ crp->cr_princhash.len = princhash.len;
+ } else {
+ dprintk("%s: failed to allocate memory for princhash.data!\n",
+ __func__);
+ crp = NULL;
+ }
+ }
+ up_write(&nn->reclaim_str_hashtbl_lock);
+ return crp;
+ }
+
name.data = kmemdup(name.data, name.len, GFP_KERNEL);
if (!name.data) {
dprintk("%s: failed to allocate memory for name.data!\n",
__func__);
+ up_write(&nn->reclaim_str_hashtbl_lock);
return NULL;
}
if (princhash.len) {
@@ -9297,6 +9342,7 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
dprintk("%s: failed to allocate memory for princhash.data!\n",
__func__);
kfree(name.data);
+ up_write(&nn->reclaim_str_hashtbl_lock);
return NULL;
}
} else
@@ -9316,6 +9362,7 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
kfree(name.data);
kfree(princhash.data);
}
+ up_write(&nn->reclaim_str_hashtbl_lock);
return crp;
}

@@ -9335,6 +9382,7 @@ nfs4_release_reclaim(struct nfsd_net *nn)
struct nfs4_client_reclaim *crp = NULL;
int i;

+ down_write(&nn->reclaim_str_hashtbl_lock);
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
while (!list_empty(&nn->reclaim_str_hashtbl[i])) {
crp = list_entry(nn->reclaim_str_hashtbl[i].next,
@@ -9343,6 +9391,7 @@ nfs4_release_reclaim(struct nfsd_net *nn)
}
}
WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);
+ up_write(&nn->reclaim_str_hashtbl_lock);
}

/*

--
2.54.0