[PATCH v2 3/9] nfsd: convert nfsd_net boolean flags to unsigned long flags word

From: Jeff Layton

Date: Sat May 30 2026 - 09:20:48 EST


From: Chris Mason <clm@xxxxxxxx>

nfsd_net contains several boolean fields that are accessed from
concurrent contexts without serialization. In particular,
nfsd4_end_grace() guards its drain path with a plain bool:

    if (nn->grace_ended)
        return;
    nn->grace_ended = true;

The read and the write are independent, and nothing in struct
nfsd_net serializes them. At least two contexts can reach this
code with no lock held:

  laundromat path
    laundry_wq kworker
      nfs4_laundromat()
        nfsd4_end_grace()

  RECLAIM_COMPLETE path
    nfsd compound kthread
      nfsd4_reclaim_complete()
        inc_reclaim_complete()
          nfsd4_end_grace()

Both callers can observe grace_ended == false on different CPUs,
both store true, and both proceed into nfsd4_record_grace_done(),
which invokes the active client_tracking_ops->grace_done callback.
For tracking ops that drain reclaim_str_hashtbl (legacy_tracking_ops
via nfsd4_recdir_purge_old, and the cld v1+ ops via
nfsd4_cld_grace_done), grace_done calls nfs4_release_reclaim(),
which walks every bucket of reclaim_str_hashtbl with no lock and
calls nfs4_remove_reclaim_record() (list_del + kfree) on each
entry. Two concurrent walkers corrupt the list and double-free
every nfs4_client_reclaim. A concurrent nfsd4_find_reclaim_client()
iterating the same bucket reads through freed memory.

A third call site exists in nfs4_state_start_net() on the
skip_grace startup path, but it runs under nfsd_mutex before any
client has connected and before the laundromat's first delayed
work fires, so it cannot race with the two callers above.

Replace the scattered boolean fields in nfsd_net with a single
unsigned long flags word and an enum nfsd_net_flag for the bit
positions. The grace_ended race is fixed by using
test_and_set_bit(), which is atomic on all architectures. The
remaining flags (grace_end_forced, in_grace, somebody_reclaimed,
track_reclaim_completes, nfsd_net_up, lockd_up) are converted to
use test_bit/set_bit/clear_bit for consistency. Operating on a full
unsigned long word, rather than on the individual bool fields, avoids
sub-word cmpxchg issues on architectures like Hexagon that only
support word-sized atomic operations.

Fixes: 362063a595be ("nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld")
Assisted-by: kres:claude-opus-4-7
Reported-by: Chris Mason <clm@xxxxxxxx>
Signed-off-by: Chris Mason <clm@xxxxxxxx>
---
fs/nfsd/netns.h | 19 +++++++++++--------
fs/nfsd/nfs4proc.c | 2 +-
fs/nfsd/nfs4recover.c | 12 ++++++------
fs/nfsd/nfs4state.c | 40 ++++++++++++++++++++++++----------------
fs/nfsd/nfsctl.c | 2 +-
fs/nfsd/nfssvc.c | 22 +++++++++++-----------
6 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 27da1a3edacb..37dfecb9d49d 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -28,6 +28,16 @@ struct cld_net;
struct nfsd_net_cb;
struct nfsd4_client_tracking_ops;

+enum nfsd_net_flag {
+ NFSD_NET_GRACE_ENDED,
+ NFSD_NET_GRACE_END_FORCED,
+ NFSD_NET_IN_GRACE,
+ NFSD_NET_SOMEBODY_RECLAIMED,
+ NFSD_NET_TRACK_RECLAIM_COMPLETES,
+ NFSD_NET_UP,
+ NFSD_NET_LOCKD_UP,
+};
+
enum {
/* cache misses due only to checksum comparison failures */
NFSD_STATS_PAYLOAD_MISSES,
@@ -66,8 +76,7 @@ struct nfsd_net {
struct cache_detail *nametoid_cache;

struct lock_manager nfsd4_manager;
- bool grace_ended;
- bool grace_end_forced;
+ unsigned long flags;
time64_t boot_time;

struct dentry *nfsd_client_dir;
@@ -117,19 +126,13 @@ struct nfsd_net {
spinlock_t blocked_locks_lock;

struct file *rec_file;
- bool in_grace;
const struct nfsd4_client_tracking_ops *client_tracking_ops;

time64_t nfsd4_lease;
time64_t nfsd4_grace;
- bool somebody_reclaimed;

- bool track_reclaim_completes;
atomic_t nr_reclaim_complete;

- bool nfsd_net_up;
- bool lockd_up;
-
seqlock_t writeverf_lock;
unsigned char writeverf[8];

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5f2b9bfc3a84..9473aeb53f72 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -667,7 +667,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
pr_warn("nfsd4_process_open2 failed to open newly-created file: status=%u\n",
be32_to_cpu(status));
if (reclaim && !status)
- nn->somebody_reclaimed = true;
+ set_bit(NFSD_NET_SOMEBODY_RECLAIMED, &nn->flags);
out:
if (open->op_filp) {
fput(open->op_filp);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 6ea25a52d2f4..c841da585142 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -167,7 +167,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
end_creating(dentry);
out:
if (status == 0) {
- if (nn->in_grace)
+ if (test_bit(NFSD_NET_IN_GRACE, &nn->flags))
__nfsd4_create_reclaim_record_grace(clp, dname, nn);
vfs_fsync(nn->rec_file, 0);
} else {
@@ -317,7 +317,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
nfs4_reset_creds(original_cred);
if (status == 0) {
vfs_fsync(nn->rec_file, 0);
- if (nn->in_grace)
+ if (test_bit(NFSD_NET_IN_GRACE, &nn->flags))
__nfsd4_remove_reclaim_record_grace(dname,
HEXDIR_LEN, nn);
}
@@ -373,7 +373,7 @@ nfsd4_recdir_purge_old(struct nfsd_net *nn)
{
int status;

- nn->in_grace = false;
+ clear_bit(NFSD_NET_IN_GRACE, &nn->flags);
if (!nn->rec_file)
return;
status = mnt_want_write_file(nn->rec_file);
@@ -455,7 +455,7 @@ nfsd4_init_recdir(struct net *net)

nfs4_reset_creds(original_cred);
if (!status)
- nn->in_grace = true;
+ set_bit(NFSD_NET_IN_GRACE, &nn->flags);
return status;
}

@@ -1362,7 +1362,7 @@ nfs4_cld_state_init(struct net *net)
for (i = 0; i < CLIENT_HASH_SIZE; i++)
INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
nn->reclaim_str_hashtbl_size = 0;
- nn->track_reclaim_completes = true;
+ set_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags);
atomic_set(&nn->nr_reclaim_complete, 0);

return 0;
@@ -1373,7 +1373,7 @@ nfs4_cld_state_shutdown(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);

- nn->track_reclaim_completes = false;
+ clear_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags);
kfree(nn->reclaim_str_hashtbl);
}

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9503859918ac..bc5216bb08ff 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2777,7 +2777,7 @@ static void inc_reclaim_complete(struct nfs4_client *clp)
{
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);

- if (!nn->track_reclaim_completes)
+ if (!test_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags))
return;
if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
return;
@@ -5309,8 +5309,6 @@ nfsd4_init_leases_net(struct nfsd_net *nn)

nn->nfsd4_lease = 90; /* default lease time */
nn->nfsd4_grace = 90;
- nn->somebody_reclaimed = false;
- nn->track_reclaim_completes = false;
nn->clverifier_counter = get_random_u32();
nn->clientid_base = get_random_u32();
nn->clientid_counter = nn->clientid_base + 1;
@@ -7022,12 +7020,21 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static void
nfsd4_end_grace(struct nfsd_net *nn)
{
- /* do nothing if grace period already ended */
- if (nn->grace_ended)
+ /*
+ * nfsd4_end_grace() can be entered concurrently from the
+ * laundromat workqueue and from an nfsd compound thread
+ * handling RECLAIM_COMPLETE. Without serialization, both
+ * callers can observe grace_ended==false and proceed into
+ * nfsd4_record_grace_done(). For tracking ops whose
+ * grace_done drains reclaim_str_hashtbl, that results in
+ * list corruption and a double free of every
+ * nfs4_client_reclaim entry. Use an atomic test-and-set so
+ * exactly one caller proceeds.
+ */
+ if (test_and_set_bit(NFSD_NET_GRACE_ENDED, &nn->flags))
return;

trace_nfsd_grace_complete(nn);
- nn->grace_ended = true;
/*
* If the server goes down again right now, an NFSv4
* client will still be allowed to reclaim after it comes back up,
@@ -7068,10 +7075,10 @@ bool nfsd4_force_end_grace(struct nfsd_net *nn)
{
if (!nn->client_tracking_ops)
return false;
- if (READ_ONCE(nn->grace_ended))
+ if (test_bit(NFSD_NET_GRACE_ENDED, &nn->flags))
return false;
/* laundromat_work must be initialised now, though it might be disabled */
- WRITE_ONCE(nn->grace_end_forced, true);
+ set_bit(NFSD_NET_GRACE_END_FORCED, &nn->flags);
/* mod_delayed_work() doesn't queue work after
* nfs4_state_shutdown_net() has called disable_delayed_work_sync()
*/
@@ -7088,15 +7095,15 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)
time64_t double_grace_period_end = nn->boot_time +
2 * nn->nfsd4_lease;

- if (READ_ONCE(nn->grace_end_forced))
+ if (test_bit(NFSD_NET_GRACE_END_FORCED, &nn->flags))
return false;
- if (nn->track_reclaim_completes &&
+ if (test_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags) &&
atomic_read(&nn->nr_reclaim_complete) ==
nn->reclaim_str_hashtbl_size)
return false;
- if (!nn->somebody_reclaimed)
+ if (!test_bit(NFSD_NET_SOMEBODY_RECLAIMED, &nn->flags))
return false;
- nn->somebody_reclaimed = false;
+ clear_bit(NFSD_NET_SOMEBODY_RECLAIMED, &nn->flags);
/*
* If we've given them *two* lease times to reclaim, and they're
* still not done, give up:
@@ -8887,7 +8894,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
status = 0;
if (lock->lk_reclaim)
- nn->somebody_reclaimed = true;
+ set_bit(NFSD_NET_SOMEBODY_RECLAIMED, &nn->flags);
break;
case FILE_LOCK_DEFERRED:
kref_put(&nbl->nbl_kref, free_nbl);
@@ -9413,8 +9420,8 @@ static int nfs4_state_create_net(struct net *net)
nn->conf_name_tree = RB_ROOT;
nn->unconf_name_tree = RB_ROOT;
nn->boot_time = ktime_get_real_seconds();
- nn->grace_ended = false;
- nn->grace_end_forced = false;
+ clear_bit(NFSD_NET_GRACE_ENDED, &nn->flags);
+ clear_bit(NFSD_NET_GRACE_END_FORCED, &nn->flags);
nn->nfsd4_manager.block_opens = true;
INIT_LIST_HEAD(&nn->nfsd4_manager.list);
INIT_LIST_HEAD(&nn->client_lru);
@@ -9500,7 +9507,8 @@ nfs4_state_start_net(struct net *net)
nfsd4_client_tracking_init(net);
/* safe for laundromat to run now */
enable_delayed_work(&nn->laundromat_work);
- if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
+ if (test_bit(NFSD_NET_TRACK_RECLAIM_COMPLETES, &nn->flags) &&
+ nn->reclaim_str_hashtbl_size == 0)
goto skip_grace;
printk(KERN_INFO "NFSD: starting %lld-second grace period (net %x)\n",
nn->nfsd4_grace, net->ns.inum);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 468aad8c3af9..92f65ca6f667 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1111,7 +1111,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
}

return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
- nn->grace_ended ? 'Y' : 'N');
+ test_bit(NFSD_NET_GRACE_ENDED, &nn->flags) ? 'Y' : 'N');
}

#endif
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index be0add971c2d..551d3cf51036 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -351,7 +351,7 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int ret;

- if (nn->nfsd_net_up)
+ if (test_bit(NFSD_NET_UP, &nn->flags))
return 0;

ret = nfsd_startup_generic();
@@ -364,11 +364,11 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
goto out_socks;
}

- if (nfsd_needs_lockd(nn) && !nn->lockd_up) {
+ if (nfsd_needs_lockd(nn) && !test_bit(NFSD_NET_LOCKD_UP, &nn->flags)) {
ret = lockd_up(net, cred);
if (ret)
goto out_socks;
- nn->lockd_up = true;
+ set_bit(NFSD_NET_LOCKD_UP, &nn->flags);
}

ret = nfsd_file_cache_start_net(net);
@@ -386,7 +386,7 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
if (ret)
goto out_reply_cache;

- nn->nfsd_net_up = true;
+ set_bit(NFSD_NET_UP, &nn->flags);
return 0;

out_reply_cache:
@@ -394,9 +394,9 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
out_filecache:
nfsd_file_cache_shutdown_net(net);
out_lockd:
- if (nn->lockd_up) {
+ if (test_bit(NFSD_NET_LOCKD_UP, &nn->flags)) {
lockd_down(net);
- nn->lockd_up = false;
+ clear_bit(NFSD_NET_LOCKD_UP, &nn->flags);
}
out_socks:
nfsd_shutdown_generic();
@@ -407,7 +407,7 @@ static void nfsd_shutdown_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);

- if (nn->nfsd_net_up) {
+ if (test_bit(NFSD_NET_UP, &nn->flags)) {
percpu_ref_kill_and_confirm(&nn->nfsd_net_ref, nfsd_net_done);
wait_for_completion(&nn->nfsd_net_confirm_done);

@@ -415,18 +415,18 @@ static void nfsd_shutdown_net(struct net *net)
nfs4_state_shutdown_net(net);
nfsd_reply_cache_shutdown(nn);
nfsd_file_cache_shutdown_net(net);
- if (nn->lockd_up) {
+ if (test_bit(NFSD_NET_LOCKD_UP, &nn->flags)) {
lockd_down(net);
- nn->lockd_up = false;
+ clear_bit(NFSD_NET_LOCKD_UP, &nn->flags);
}
wait_for_completion(&nn->nfsd_net_free_done);
}

percpu_ref_exit(&nn->nfsd_net_ref);

- if (nn->nfsd_net_up)
+ if (test_bit(NFSD_NET_UP, &nn->flags))
nfsd_shutdown_generic();
- nn->nfsd_net_up = false;
+ clear_bit(NFSD_NET_UP, &nn->flags);
}

static DEFINE_SPINLOCK(nfsd_notifier_lock);

--
2.54.0