Re: [PATCH 1/3] nfsd: fix cpntf publish race in nfs4_init_cp_state

From: NeilBrown

Date: Thu Jun 25 2026 - 23:47:02 EST


On Thu, 25 Jun 2026, Jeff Layton wrote:
> From: Chris Mason <clm@xxxxxxxx>
>
> nfs4_alloc_init_cpntf_state() splits IDR publication and cp_list
> linkage across two separate s2s_cp_lock critical sections. The first
> installs the new nfs4_cpntf_state into nn->s2s_cp_stateids and then
> assigns cs_type = NFS4_COPYNOTIFY_STID; the second acquires the lock
> again to list_add() the entry onto p_stid->sc_cp_list. Between the
> two, the entry is reachable by so_id with the correct cs_type, but
> cp_list is still {NULL, NULL} from kzalloc.
>
> A second NFSv4.2 request can race in that gap:
>
> CPU 0 CPU 1
> ----- -----
> nfs4_alloc_init_cpntf_state()
> nfs4_init_cp_state()
> spin_lock(&s2s_cp_lock)
> idr_alloc_cyclic()
> spin_unlock(&s2s_cp_lock)
> cps->cp_stateid.cs_type =
> NFS4_COPYNOTIFY_STID
> nfsd4_offload_cancel()
> find_async_copy() -> NULL
> manage_cpntf_state(clp!=NULL)
> spin_lock(&s2s_cp_lock)
> idr_find() -> cps
> cs_type check passes
> _free_cpntf_state_locked()
> refcount_dec_and_test
> list_del(&cps->cp_list)
> spin_lock(&s2s_cp_lock) /* {NULL,NULL} -> oops */
> list_add(&cps->cp_list, ...)
>
> The so_id returned to the client by COPY_NOTIFY is echoed back as
> cnr_stateid, so any authenticated NFSv4.2 client can drive
> OFFLOAD_CANCEL into manage_cpntf_state() against an entry still in
> the window. list_del() on a zeroed list_head dereferences NULL and
> oopses the server.
>
> Fold cs_type assignment and the list_add() onto p_stid->sc_cp_list
> into the same s2s_cp_lock critical section that runs idr_alloc_cyclic.
> A concurrent manage_cpntf_state() now either fails idr_find() (entry
> not yet visible) or observes a fully linked cp_list. Initialize
> cp_list with INIT_LIST_HEAD() after allocation, and switch
> _free_cpntf_state_locked() to list_del_init() so that any stale
> unlink on an already-unlinked entry is a benign no-op rather than a
> NULL dereference. nfs4_init_copy_state() passes NULL for p_stid and
> skips the list_add branch, preserving NFS4_COPY_STID semantics.
>
> Fixes: 624322f1adc5 ("NFSD add COPY_NOTIFY operation")
> Assisted-by: kres:claude-opus-4-7
> Signed-off-by: Chris Mason <clm@xxxxxxxx>
> ---
> fs/nfsd/nfs4state.c | 42 ++++++++++++++++++++++++++++++++----------
> 1 file changed, 32 insertions(+), 10 deletions(-)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index a4398dc861a5..b8946db3ebaa 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -944,7 +944,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla
> * Create a unique stateid_t to represent each COPY.
> */
> static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
> - unsigned char cs_type)
> + unsigned char cs_type, struct nfs4_stid *p_stid)
> {
> int new_id;
>
> @@ -954,19 +954,37 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
> idr_preload(GFP_KERNEL);
> spin_lock(&nn->s2s_cp_lock);
> new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT);
> - stid->cs_stid.si_opaque.so_id = new_id;
> - stid->cs_stid.si_generation = 1;
> + if (new_id >= 0) {
> + stid->cs_stid.si_opaque.so_id = new_id;
> + stid->cs_stid.si_generation = 1;
> + /*
> + * Publish cs_type and link onto the parent stid's
> + * sc_cp_list inside the same critical section that
> + * installed the entry into nn->s2s_cp_stateids. A
> + * concurrent manage_cpntf_state() either fails the
> + * idr_find() (entry not yet visible) or observes a
> + * fully linked cp_list, so list_del_init() in
> + * _free_cpntf_state_locked() is always well-defined.
> + */

This comment seems excessive. The explanation certainly should be in
the commit message, and is probably there already. It doesn't need to
be here too.

> + stid->cs_type = cs_type;
> + if (p_stid) {

I would rather this switched on "cs_type == NFS4_COPYNOTIFY_STID"

A substantially simpler solution would be to *not* pass cs_type to
nfs4_init_cp_state() so the type remains initialised to zero, which is
invalid.
Anything which finds that stateid will ignore it because they already
check the type.
Then callers of nfs4_init_cp_state() set the type themselves.

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f4d12dbcf97b..7a8010f450ea 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -943,8 +943,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla
/*
* Create a unique stateid_t to represent each COPY.
*/
-static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
- unsigned char cs_type)
+static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid)
{
int new_id;

@@ -960,13 +959,16 @@ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
idr_preload_end();
if (new_id < 0)
return 0;
- stid->cs_type = cs_type;
return 1;
}

int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy)
{
- return nfs4_init_cp_state(nn, &copy->cp_stateid, NFS4_COPY_STID);
+ if (nfs4_init_cp_state(nn, &copy->cp_stateid)){
+ stid->cs_type = NFS4_COPY_STID;
+ return 1;
+ }
+ return 0;
}

struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
@@ -979,10 +981,11 @@ struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
return NULL;
cps->cpntf_time = ktime_get_boottime_seconds();
refcount_set(&cps->cp_stateid.cs_count, 1);
- if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID))
+ if (!nfs4_init_cp_state(nn, &cps->cp_stateid))
goto out_free;
spin_lock(&nn->s2s_cp_lock);
list_add(&cps->cp_list, &p_stid->sc_cp_list);
+ stid->cs_type = NFS4_COPYNOTIFY_STID;
spin_unlock(&nn->s2s_cp_lock);
return cps;
out_free:


Maybe not a "Better" solution, I'm not sure. But certainly "simpler".
NeilBrown



> + struct nfs4_cpntf_state *cps =
> + container_of(stid, struct nfs4_cpntf_state,
> + cp_stateid);
> +
> + list_add(&cps->cp_list, &p_stid->sc_cp_list);
> + }
> + }
> spin_unlock(&nn->s2s_cp_lock);
> idr_preload_end();
> if (new_id < 0)
> return 0;
> - stid->cs_type = cs_type;
> return 1;
> }
>
> int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy)
> {
> - return nfs4_init_cp_state(nn, &copy->cp_stateid, NFS4_COPY_STID);
> + return nfs4_init_cp_state(nn, &copy->cp_stateid, NFS4_COPY_STID, NULL);
> }
>
> struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
> @@ -977,13 +995,17 @@ struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
> cps = kzalloc_obj(struct nfs4_cpntf_state);
> if (!cps)
> return NULL;
> + /*
> + * Initialize cp_list so any stale unlink (e.g. on an
> + * entry that never reached its parent's sc_cp_list)
> + * degrades to a benign self-unlink via list_del_init().
> + */
> + INIT_LIST_HEAD(&cps->cp_list);
> cps->cpntf_time = ktime_get_boottime_seconds();
> refcount_set(&cps->cp_stateid.cs_count, 1);
> - if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID))
> + if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID,
> + p_stid))
> goto out_free;
> - spin_lock(&nn->s2s_cp_lock);
> - list_add(&cps->cp_list, &p_stid->sc_cp_list);
> - spin_unlock(&nn->s2s_cp_lock);
> return cps;
> out_free:
> kfree(cps);
> @@ -7854,7 +7876,7 @@ _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps)
> WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID);
> if (!refcount_dec_and_test(&cps->cp_stateid.cs_count))
> return;
> - list_del(&cps->cp_list);
> + list_del_init(&cps->cp_list);
> idr_remove(&nn->s2s_cp_stateids,
> cps->cp_stateid.cs_stid.si_opaque.so_id);
> kfree(cps);
>
> --
> 2.54.0
>
>