Re: [PATCH v5 10/21] nfsd: add notification handlers for dir events

From: Chuck Lever

Date: Mon Jun 08 2026 - 16:43:48 EST




On Fri, May 22, 2026, at 3:42 PM, Jeff Layton wrote:
> Add the necessary parts to accept a fsnotify callback for directory
> change event and create a CB_NOTIFY request for it. When a dir nfsd_file
> is created set a handle_event callback to handle the notification.
>
> Use that to allocate a nfsd_notify_event object and then hand off a
> reference to each delegation's CB_NOTIFY. If anything fails along the
> way, recall any affected delegations.
>
> Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>

There are some significant-looking sashiko review findings which I did
not follow up on.


> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index ea3e7deb06fa..1964a213f80e 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c
> @@ -870,21 +870,30 @@ static void nfs4_xdr_enc_cb_notify(struct
> rpc_rqst *req,
> const void *data)
> {
> const struct nfsd4_callback *cb = data;
> + struct nfsd4_cb_notify *ncn = container_of(cb, struct
> nfsd4_cb_notify, ncn_cb);
> + struct nfs4_delegation *dp = container_of(ncn, struct
> nfs4_delegation, dl_cb_notify);
> struct nfs4_cb_compound_hdr hdr = {
> .ident = 0,
> .minorversion = cb->cb_clp->cl_minorversion,
> };
> - struct CB_NOTIFY4args args = { };
> + struct CB_NOTIFY4args args;
> + __be32 *p;
>
> WARN_ON_ONCE(hdr.minorversion == 0);
>
> encode_cb_compound4args(xdr, &hdr);
> encode_cb_sequence4args(xdr, cb, &hdr);
>
> - /*
> - * FIXME: get stateid and fh from delegation. Inline the cna_changes
> - * buffer, and zero it.
> - */
> + p = xdr_reserve_space(xdr, 4);
> + *p = cpu_to_be32(OP_CB_NOTIFY);
> +
> + args.cna_stateid.seqid = dp->dl_stid.sc_stateid.si_generation;
> + memcpy(&args.cna_stateid.other, &dp->dl_stid.sc_stateid.si_opaque,
> + ARRAY_SIZE(args.cna_stateid.other));
> + args.cna_fh.len = dp->dl_stid.sc_file->fi_fhandle.fh_size;
> + args.cna_fh.data = dp->dl_stid.sc_file->fi_fhandle.fh_raw;
> + args.cna_changes.count = ncn->ncn_nf_cnt;
> + args.cna_changes.element = ncn->ncn_nf;
> WARN_ON_ONCE(!xdrgen_encode_CB_NOTIFY4args(xdr, &args));
>
> hdr.nops++;

I want to avoid the need to use xdrgen to encode the CB_NOTIFY arguments.
How about this:

+ struct nfsd4_cb_notify *ncn = container_of(cb, struct nfsd4_cb_notify, ncn_cb);
+ struct nfs4_delegation *dp = container_of(ncn, struct nfs4_delegation, dl_cb_notify);

...

+ encode_stateid4(xdr, &dp->dl_stid.sc_stateid);
+ encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle);
+ xdr_stream_encode_u32(xdr, ncn->ncn_nf_cnt);
+ for (u32 i = 0; i < ncn->ncn_nf_cnt; i++)
+ (void)xdrgen_encode_notify4(xdr, &ncn->ncn_nf[i]);

And then add a "pragma public notify4;" in nfs4_1.x .


> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index b0652c755b3b..20477144475b 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c

> @@ -3461,19 +3462,131 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
> nfs4_put_stid(&dp->dl_stid);
> }
>
> +static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
> +{
> + bool queued;
> +
> + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags))
> + return;
> +
> + /*
> + * We're assuming the state code never drops its reference
> + * without first removing the lease. Since we're in this lease
> + * callback (and since the lease code is serialized by the
> + * flc_lock) we know the server hasn't removed the lease yet, and
> + * we know it's safe to take a reference.
> + */
> + refcount_inc(&dp->dl_stid.sc_count);
> + queued = nfsd4_run_cb(&dp->dl_recall);
> + WARN_ON_ONCE(!queued);
> + if (!queued)
> + refcount_dec(&dp->dl_stid.sc_count);
> +}
> +
> +static bool
> +nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
> +{
> + struct nfsd4_cb_notify *ncn = container_of(cb, struct
> nfsd4_cb_notify, ncn_cb);
> + struct nfs4_delegation *dp = container_of(ncn, struct
> nfs4_delegation, dl_cb_notify);
> + struct nfsd_notify_event *events[NOTIFY4_EVENT_QUEUE_SIZE];
> + struct xdr_buf xdr = { .buflen = PAGE_SIZE * NOTIFY4_PAGE_ARRAY_SIZE,
> + .pages = ncn->ncn_pages };
> + struct xdr_stream stream;
> + struct nfsd_file *nf;
> + int count, i;
> + bool error = false;
> +
> + xdr_init_encode_pages(&stream, &xdr);
> +
> + spin_lock(&ncn->ncn_lock);
> + count = ncn->ncn_evt_cnt;
> +
> + /* spurious queueing? */
> + if (count == 0) {
> + spin_unlock(&ncn->ncn_lock);
> + return false;
> + }
> +
> + /* we can't keep up! */
> + if (count > NOTIFY4_EVENT_QUEUE_SIZE) {
> + spin_unlock(&ncn->ncn_lock);
> + goto out_recall;
> + }
> +
> + memcpy(events, ncn->ncn_evt, sizeof(*events) * count);
> + ncn->ncn_evt_cnt = 0;
> + spin_unlock(&ncn->ncn_lock);
> +
> + rcu_read_lock();
> + nf =
> nfsd_file_get(rcu_dereference(dp->dl_stid.sc_file->fi_deleg_file));
> + rcu_read_unlock();
> + if (!nf) {
> + for (i = 0; i < count; ++i)
> + nfsd_notify_event_put(events[i]);
> + goto out_recall;
> + }
> +
> + for (i = 0; i < count; ++i) {
> + struct nfsd_notify_event *nne = events[i];
> +
> + if (!error) {
> + u32 *maskp = (u32 *)xdr_reserve_space(&stream, sizeof(*maskp));
> + u8 *p;
> +
> + if (!maskp) {
> + error = true;
> + goto put_event;
> + }
> +
> + p = nfsd4_encode_notify_event(&stream, nne, dp, nf, maskp);
> + if (!p) {
> + pr_notice("Could not generate CB_NOTIFY from fsnotify mask 0x%x\n",
> + nne->ne_mask);
> + error = true;
> + goto put_event;
> + }
> +
> + ncn->ncn_nf[i].notify_mask.count = 1;
> + ncn->ncn_nf[i].notify_mask.element = maskp;
> + ncn->ncn_nf[i].notify_vals.data = p;
> + ncn->ncn_nf[i].notify_vals.len = (u8 *)stream.p - p;
> + }
> +put_event:
> + nfsd_notify_event_put(nne);
> + }
> + if (!error) {
> + ncn->ncn_nf_cnt = count;
> + nfsd_file_put(nf);
> + return true;
> + }
> + nfsd_file_put(nf);
> +out_recall:
> + nfsd_break_one_deleg(dp);
> + return false;
> +}
> +
> static int
> nfsd4_cb_notify_done(struct nfsd4_callback *cb,
> struct rpc_task *task)
> {
> + struct nfsd4_cb_notify *ncn = container_of(cb, struct
> nfsd4_cb_notify, ncn_cb);
> + struct nfs4_delegation *dp = container_of(ncn, struct
> nfs4_delegation, dl_cb_notify);
> +
> switch (task->tk_status) {
> case -NFS4ERR_DELAY:
> rpc_delay(task, 2 * HZ);
> return 0;
> default:
> + /* For any other hard error, recall the deleg */
> + nfsd_break_one_deleg(dp);
> + fallthrough;
> + case 0:
> return 1;
> }
> }
>
> +static void nfsd4_run_cb_notify(struct nfsd4_cb_notify *ncn);
> +
> static void
> nfsd4_cb_notify_release(struct nfsd4_callback *cb)
> {
> @@ -3482,6 +3595,9 @@ nfsd4_cb_notify_release(struct nfsd4_callback *cb)
> struct nfs4_delegation *dp =
> container_of(ncn, struct nfs4_delegation, dl_cb_notify);
>
> + /* Drain events that arrived while this callback was in flight */
> + if (ncn->ncn_evt_cnt > 0)
> + nfsd4_run_cb_notify(ncn);

The above check needs to be serialized with modification of
ncn_evt_cnt:

+ bool pending;

+ /* Drain events that arrived while this callback was in flight */
+ spin_lock(&ncn->ncn_lock);
+ pending = ncn->ncn_evt_cnt > 0;
+ spin_unlock(&ncn->ncn_lock);
+ if (pending)
+ nfsd4_run_cb_notify(ncn);


> nfs4_put_stid(&dp->dl_stid);
> }
>

> @@ -9858,3 +9954,133 @@ void nfsd_update_cmtime_attr(struct file *f,
> unsigned int flags)
> MINOR(inode->i_sb->s_dev),
> inode->i_ino, ret);
> }
> +
> +static void
> +nfsd4_run_cb_notify(struct nfsd4_cb_notify *ncn)
> +{
> + struct nfs4_delegation *dp = container_of(ncn, struct
> nfs4_delegation, dl_cb_notify);
> +
> + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ncn->ncn_cb.cb_flags))
> + return;
> +
> + if (!refcount_inc_not_zero(&dp->dl_stid.sc_count))
> + clear_bit(NFSD4_CALLBACK_RUNNING, &ncn->ncn_cb.cb_flags);
> + else
> + nfsd4_run_cb(&ncn->ncn_cb);
> +}
> +
> +static struct nfsd_notify_event *
> +alloc_nfsd_notify_event(u32 mask, const struct qstr *q, struct dentry
> *dentry,
> + struct inode *target)
> +{
> + struct nfsd_notify_event *ne;
> +
> + ne = kmalloc(sizeof(*ne) + q->len + 1, GFP_NOFS);
> + if (!ne)
> + return NULL;
> +
> + memcpy(&ne->ne_name, q->name, q->len);
> + refcount_set(&ne->ne_ref, 1);
> + ne->ne_mask = mask;
> + ne->ne_name[q->len] = '\0';
> + ne->ne_namelen = q->len;
> + ne->ne_dentry = dget(dentry);
> + ne->ne_target = target;
> + if (ne->ne_target)
> + ihold(ne->ne_target);
> + return ne;
> +}
> +
> +static bool
> +should_notify_deleg(u32 mask, struct file_lease *fl)
> +{
> + /* Don't notify the client generating the event */
> + if (nfsd_breaker_owns_lease(fl))
> + return false;
> +
> + /* Skip if this event wasn't ignored by the lease */
> + if ((mask & FS_DELETE) && !(fl->c.flc_flags & FL_IGN_DIR_DELETE))
> + return false;
> + if ((mask & FS_CREATE) && !(fl->c.flc_flags & FL_IGN_DIR_CREATE))
> + return false;
> + if ((mask & FS_RENAME) && !(fl->c.flc_flags & FL_IGN_DIR_RENAME))
> + return false;
> +
> + return true;
> +}
> +
> +static void
> +nfsd_recall_all_dir_delegs(const struct inode *dir)
> +{
> + struct file_lock_context *ctx = locks_inode_context(dir);
> + struct file_lock_core *flc;
> +
> + spin_lock(&ctx->flc_lock);
> + list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
> + struct file_lease *fl = container_of(flc, struct file_lease, c);
> +
> + if (fl->fl_lmops == &nfsd_lease_mng_ops)
> + nfsd_break_deleg_cb(fl);
> + }
> + spin_unlock(&ctx->flc_lock);
> +}
> +
> +int
> +nfsd_handle_dir_event(u32 mask, const struct inode *dir, const void
> *data,
> + int data_type, const struct qstr *name)
> +{
> + struct dentry *dentry = fsnotify_data_dentry(data, data_type);
> + struct inode *target = fsnotify_data_rename_target(data, data_type);
> + struct file_lock_context *ctx;
> + struct file_lock_core *flc;
> + struct nfsd_notify_event *evt;
> +
> + /* Normalize cross-dir rename events to create/delete */
> + if (mask & FS_MOVED_FROM) {
> + mask &= ~FS_MOVED_FROM;
> + mask |= FS_DELETE;
> + }
> + if (mask & FS_MOVED_TO) {
> + mask &= ~FS_MOVED_TO;
> + mask |= FS_CREATE;
> + }
> +

I inserted an extra check here for rename notifications:

+ /*
+ * FS_RENAME fires on the source directory even for a cross-dir
+ * rename, where the moved entry now lives under a different
+ * parent. NOTIFY4_RENAME_ENTRY describes an in-place rename, so
+ * reporting it here would advertise a name absent from this
+ * directory.
+ */
+ if ((mask & FS_RENAME) && dentry && d_inode(dentry->d_parent) != dir)
+ mask &= ~FS_RENAME;


> + /* Don't do anything if this is not an expected event */
> + if (!(mask & (FS_CREATE|FS_DELETE|FS_RENAME)))
> + return 0;
> +
> + ctx = locks_inode_context(dir);
> + if (!ctx || list_empty(&ctx->flc_lease))
> + return 0;
> +
> + evt = alloc_nfsd_notify_event(mask, name, dentry, target);
> + if (!evt) {
> + nfsd_recall_all_dir_delegs(dir);
> + return 0;
> + }
> +
> + spin_lock(&ctx->flc_lock);
> + list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
> + struct file_lease *fl = container_of(flc, struct file_lease, c);
> + struct nfs4_delegation *dp = flc->flc_owner;
> + struct nfsd4_cb_notify *ncn = &dp->dl_cb_notify;
> +

I added:

+ if (fl->fl_lmops != &nfsd_lease_mng_ops)
+ continue;

Otherwise the loop treats every lease on the inode as an nfsd delegation
unconditionally.


> + if (!should_notify_deleg(mask, fl))
> + continue;
> +
> + spin_lock(&ncn->ncn_lock);
> + if (ncn->ncn_evt_cnt >= NOTIFY4_EVENT_QUEUE_SIZE) {
> + /* We're generating notifications too fast. Recall. */
> + spin_unlock(&ncn->ncn_lock);
> + nfsd_break_deleg_cb(fl);
> + continue;
> + }
> + ncn->ncn_evt[ncn->ncn_evt_cnt++] = nfsd_notify_event_get(evt);
> + spin_unlock(&ncn->ncn_lock);
> +
> + nfsd4_run_cb_notify(ncn);
> + }
> + spin_unlock(&ctx->flc_lock);
> + nfsd_notify_event_put(evt);
> + return 0;
> +}
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index e17488a911f7..31df04675713 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4172,6 +4172,127 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp,
> struct xdr_stream *xdr,
> goto out;
> }
>
> +static bool
> +nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream
> *xdr,
> + struct dentry *dentry, struct nfs4_delegation *dp,
> + struct nfsd_file *nf, char *name, u32 namelen)
> +{
> + uint32_t *attrmask;
> +
> + /* Reserve space for attrmask */
> + attrmask = xdr_reserve_space(xdr, 3 * sizeof(uint32_t));
> + if (!attrmask)
> + return false;
> +
> + ne->ne_file.data = name;
> + ne->ne_file.len = namelen;
> + ne->ne_attrs.attrmask.element = attrmask;
> +
> + attrmask[0] = 0;
> + attrmask[1] = 0;
> + attrmask[2] = 0;
> + ne->ne_attrs.attr_vals.data = NULL;
> + ne->ne_attrs.attr_vals.len = 0;
> + ne->ne_attrs.attrmask.count = 1;
> + return true;
> +}
> +
> +/**
> + * nfsd4_encode_notify_event - encode a notify
> + * @xdr: stream to which to encode the fattr4
> + * @nne: nfsd_notify_event to encode
> + * @dp: delegation where the event occurred
> + * @nf: nfsd_file on which event occurred
> + * @notify_mask: pointer to word where notification mask should be set
> + *
> + * Encode @nne into @xdr. Returns a pointer to the start of the event,
> or NULL if
> + * the event couldn't be encoded. The appropriate bit in the
> notify_mask will also
> + * be set on success.
> + */

Nit: Let's use the usual kdoc style to describe the return value.

+ * Encode @nne into @xdr. The matching bit in @notify_mask is set on
+ * success.
+ *
+ * Return: pointer to the start of the encoded event, or NULL if the
+ * event could not be encoded.
+ */


> +u8 *nfsd4_encode_notify_event(struct xdr_stream *xdr, struct
> nfsd_notify_event *nne,
> + struct nfs4_delegation *dp, struct nfsd_file *nf,
> + u32 *notify_mask)
> +{
> + u8 *p = NULL;
> +
> + *notify_mask = 0;
> +
> + if (nne->ne_mask & FS_DELETE) {
> + struct notify_remove4 nr = { };
> +
> + if (!nfsd4_setup_notify_entry4(&nr.nrm_old_entry, xdr,
> nne->ne_dentry, dp,
> + nf, nne->ne_name, nne->ne_namelen))
> + goto out_err;
> + p = (u8 *)xdr->p;
> + if (!xdrgen_encode_notify_remove4(xdr, &nr))
> + goto out_err;
> + *notify_mask |= BIT(NOTIFY4_REMOVE_ENTRY);
> + } else if (nne->ne_mask & FS_CREATE) {
> + struct notify_add4 na = { };
> + struct notify_remove4 old = { };
> +
> + if (!nfsd4_setup_notify_entry4(&na.nad_new_entry, xdr,
> nne->ne_dentry, dp,
> + nf, nne->ne_name, nne->ne_namelen))
> + goto out_err;
> +
> + /* If a file was overwritten, report it in nad_old_entry */
> + if (nne->ne_target) {
> + if (!nfsd4_setup_notify_entry4(&old.nrm_old_entry, xdr,
> + NULL, dp, nf,
> + nne->ne_name, nne->ne_namelen))
> + goto out_err;
> + na.nad_old_entry.count = 1;
> + na.nad_old_entry.element = &old;
> + }
> +
> + p = (u8 *)xdr->p;
> + if (!xdrgen_encode_notify_add4(xdr, &na))
> + goto out_err;
> +
> + *notify_mask |= BIT(NOTIFY4_ADD_ENTRY);
> + } else if (nne->ne_mask & FS_RENAME) {
> + struct notify_rename4 nr = { };
> + struct notify_remove4 old = { };
> + struct name_snapshot n;
> + bool ret;
> +
> + /* Don't send any attributes in the old_entry since they're the same
> in new */
> + if (!nfsd4_setup_notify_entry4(&nr.nrn_old_entry.nrm_old_entry, xdr,
> + NULL, dp, nf, nne->ne_name,
> + nne->ne_namelen))
> + goto out_err;
> +
> + take_dentry_name_snapshot(&n, nne->ne_dentry);
> + ret = nfsd4_setup_notify_entry4(&nr.nrn_new_entry.nad_new_entry, xdr,
> + nne->ne_dentry, dp, nf, (char *)n.name.name,
> + n.name.len);
> +
> + /* If a file was overwritten, report it in nad_old_entry */
> + if (ret && nne->ne_target) {
> + ret = nfsd4_setup_notify_entry4(&old.nrm_old_entry, xdr,
> + NULL, dp, nf,
> + (char *)n.name.name, n.name.len);
> + if (ret) {
> + nr.nrn_new_entry.nad_old_entry.count = 1;
> + nr.nrn_new_entry.nad_old_entry.element = &old;
> + }
> + }
> +
> + if (ret) {
> + p = (u8 *)xdr->p;
> + ret = xdrgen_encode_notify_rename4(xdr, &nr);
> + }
> + release_dentry_name_snapshot(&n);
> + if (!ret)
> + goto out_err;
> + *notify_mask |= BIT(NOTIFY4_RENAME_ENTRY);
> + }
> + return p;
> +out_err:
> + pr_warn("nfsd: unable to marshal notify_rename4 to xdr stream\n");

Nit: The warning needs to match the semantics of nfsd4_encode_notify_event().
How about:

+ pr_warn("nfsd: unable to marshal notify event to xdr stream\n");


> + return NULL;
> +}
> +


--
Chuck Lever