Re: [RFC PATCH 28/35] netfs: Adjust group handling

From: Viacheslav Dubeyko
Date: Wed Mar 19 2025 - 14:58:13 EST


On Thu, 2025-03-13 at 23:33 +0000, David Howells wrote:
> Make some adjustments to the handling of netfs groups so that ceph can use
> them for snap contexts:
>
> - Move netfs_get_group(), netfs_put_group() and netfs_put_group_many() to
> linux/netfs.h so that ceph can build its snap context on netfs groups.
>
> - Move netfs_set_group() and __netfs_set_group() to linux/netfs.h so that
> ceph_dirty_folio() can call them from inside of the locked section in
> which it finds the snap context to attach.
>
> - Provide a netfs_writepages_group() that takes a group as a parameter and
> attaches it to the request and make netfs_free_request() drop the ref on
> it. netfs_writepages() then becomes a wrapper that passes in a NULL
> group.
>
> - In netfs_perform_write(), only consider a folio to have a conflicting
> group if the folio's group pointer isn't NULL and if the folio is dirty.
>
> - In netfs_perform_write(), interject a small 10ms sleep after every 16
> attempts to flush a folio within a single call.
>
> Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
> cc: Jeff Layton <jlayton@xxxxxxxxxx>
> cc: Viacheslav Dubeyko <slava@xxxxxxxxxxx>
> cc: Alex Markuze <amarkuze@xxxxxxxxxx>
> cc: Ilya Dryomov <idryomov@xxxxxxxxx>
> cc: ceph-devel@xxxxxxxxxxxxxxx
> cc: linux-fsdevel@xxxxxxxxxxxxxxx
> ---
> fs/netfs/buffered_write.c | 25 ++++-------------
> fs/netfs/internal.h | 32 ---------------------
> fs/netfs/objects.c | 1 +
> fs/netfs/write_issue.c | 38 +++++++++++++++++++++----
> include/linux/netfs.h | 59 +++++++++++++++++++++++++++++++++++++++
> 5 files changed, 98 insertions(+), 57 deletions(-)
>
> diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
> index 0245449b93e3..12ddbe9bc78b 100644
> --- a/fs/netfs/buffered_write.c
> +++ b/fs/netfs/buffered_write.c
> @@ -11,26 +11,9 @@
> #include <linux/pagemap.h>
> #include <linux/slab.h>
> #include <linux/pagevec.h>
> +#include <linux/delay.h>
> #include "internal.h"
>
> -static void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
> -{
> - if (netfs_group)
> - folio_attach_private(folio, netfs_get_group(netfs_group));
> -}
> -
> -static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
> -{
> - void *priv = folio_get_private(folio);
> -
> - if (unlikely(priv != netfs_group)) {
> - if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
> - folio_attach_private(folio, netfs_get_group(netfs_group));
> - else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
> - folio_detach_private(folio);
> - }
> -}
> -
> /*
> * Grab a folio for writing and lock it. Attempt to allocate as large a folio
> * as possible to hold as much of the remaining length as possible in one go.
> @@ -113,6 +96,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
> };
> struct netfs_io_request *wreq = NULL;
> struct folio *folio = NULL, *writethrough = NULL;
> + unsigned int flush_counter = 0;
> unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
> ssize_t written = 0, ret, ret2;
> loff_t i_size, pos = iocb->ki_pos;
> @@ -208,7 +192,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
> group = netfs_folio_group(folio);
>
> if (unlikely(group != netfs_group) &&
> - group != NETFS_FOLIO_COPY_TO_CACHE)
> + group != NETFS_FOLIO_COPY_TO_CACHE &&
> + (group || folio_test_dirty(folio)))

I am trying to follow to this complex condition. Is it possible case that folio
is dirty but we don't flush the content?

> goto flush_content;
>
> if (folio_test_uptodate(folio)) {
> @@ -341,6 +326,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
> trace_netfs_folio(folio, netfs_flush_content);
> folio_unlock(folio);
> folio_put(folio);
> + if ((++flush_counter & 0xf) == 0xf)
> + msleep(10);

Do we really need to use sleep? And why is it 10 ms? And even if we would like
to use sleep, then it is better to introduce the named constant. And what is teh
justification for 10 ms?

> ret = filemap_write_and_wait_range(mapping, fpos, fpos + flen - 1);
> if (ret < 0)
> goto error_folio_unlock;
> diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
> index eebb4f0f660e..2a6123c4da35 100644
> --- a/fs/netfs/internal.h
> +++ b/fs/netfs/internal.h
> @@ -261,38 +261,6 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
> #endif
> }
>
> -/*
> - * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
> - */
> -static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
> -{
> - if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
> - refcount_inc(&netfs_group->ref);
> - return netfs_group;
> -}
> -
> -/*
> - * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
> - */
> -static inline void netfs_put_group(struct netfs_group *netfs_group)
> -{
> - if (netfs_group &&
> - netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
> - refcount_dec_and_test(&netfs_group->ref))
> - netfs_group->free(netfs_group);
> -}
> -
> -/*
> - * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
> - */
> -static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
> -{
> - if (netfs_group &&
> - netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
> - refcount_sub_and_test(nr, &netfs_group->ref))
> - netfs_group->free(netfs_group);
> -}
> -
> /*
> * Check to see if a buffer aligns with the crypto block size. If it doesn't
> * the crypto layer is going to copy all the data - in which case relying on
> diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
> index 52d6fce70837..7fdbaa5c5cab 100644
> --- a/fs/netfs/objects.c
> +++ b/fs/netfs/objects.c
> @@ -153,6 +153,7 @@ static void netfs_free_request(struct work_struct *work)
> kvfree(rreq->direct_bv);
> }
>
> + netfs_put_group(rreq->group);
> rolling_buffer_clear(&rreq->buffer);
> rolling_buffer_clear(&rreq->bounce);
> if (test_bit(NETFS_RREQ_PUT_RMW_TAIL, &rreq->flags))
> diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
> index 93601033ba08..3921fcf4f859 100644
> --- a/fs/netfs/write_issue.c
> +++ b/fs/netfs/write_issue.c
> @@ -418,7 +418,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
> netfs_issue_write(wreq, upload);
> } else if (fgroup != wreq->group) {
> /* We can't write this page to the server yet. */
> - kdebug("wrong group");
> + kdebug("wrong group %px != %px", fgroup, wreq->group);

I believe to use the %px is not very good practice. Do we really need to show
the real pointer?

> folio_redirty_for_writepage(wbc, folio);
> folio_unlock(folio);
> netfs_issue_write(wreq, upload);
> @@ -593,11 +593,19 @@ static void netfs_end_issue_write(struct netfs_io_request *wreq)
> netfs_wake_write_collector(wreq, false);
> }
>
> -/*
> - * Write some of the pending data back to the server
> +/**
> + * netfs_writepages_group - Flush data from the pagecache for a file
> + * @mapping: The file to flush from
> + * @wbc: Details of what should be flushed
> + * @group: The write grouping to flush (or NULL)
> + *
> + * Start asynchronous write back operations to flush dirty data belonging to a
> + * particular group in a file's pagecache back to the server and to the local
> + * cache.
> */
> -int netfs_writepages(struct address_space *mapping,
> - struct writeback_control *wbc)
> +int netfs_writepages_group(struct address_space *mapping,
> + struct writeback_control *wbc,
> + struct netfs_group *group)
> {
> struct netfs_inode *ictx = netfs_inode(mapping->host);
> struct netfs_io_request *wreq = NULL;
> @@ -618,12 +626,15 @@ int netfs_writepages(struct address_space *mapping,
> if (!folio)
> goto out;
>
> - wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
> + wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio),
> + NETFS_WRITEBACK);
> if (IS_ERR(wreq)) {
> error = PTR_ERR(wreq);
> goto couldnt_start;
> }
>
> + wreq->group = netfs_get_group(group);
> +
> trace_netfs_write(wreq, netfs_write_trace_writeback);
> netfs_stat(&netfs_n_wh_writepages);
>
> @@ -659,6 +670,21 @@ int netfs_writepages(struct address_space *mapping,
> _leave(" = %d", error);
> return error;
> }
> +EXPORT_SYMBOL(netfs_writepages_group);
> +
> +/**
> + * netfs_writepages - Flush data from the pagecache for a file
> + * @mapping: The file to flush from
> + * @wbc: Details of what should be flushed
> + *
> + * Start asynchronous write back operations to flush dirty data in a file's
> + * pagecache back to the server and to the local cache.
> + */
> +int netfs_writepages(struct address_space *mapping,
> + struct writeback_control *wbc)
> +{
> + return netfs_writepages_group(mapping, wbc, NULL);
> +}
> EXPORT_SYMBOL(netfs_writepages);
>
> /*
> diff --git a/include/linux/netfs.h b/include/linux/netfs.h
> index a67297de8a20..69052ac47ab1 100644
> --- a/include/linux/netfs.h
> +++ b/include/linux/netfs.h
> @@ -457,6 +457,9 @@ int netfs_read_folio(struct file *, struct folio *);
> int netfs_write_begin(struct netfs_inode *, struct file *,
> struct address_space *, loff_t pos, unsigned int len,
> struct folio **, void **fsdata);
> +int netfs_writepages_group(struct address_space *mapping,
> + struct writeback_control *wbc,
> + struct netfs_group *group);
> int netfs_writepages(struct address_space *mapping,
> struct writeback_control *wbc);
> bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
> @@ -597,4 +600,60 @@ static inline void netfs_wait_for_outstanding_io(struct inode *inode)
> wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0);
> }
>
> +/*
> + * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
> + */
> +static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
> +{
> + if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)

The netfs_group is a pointer. Is it correct comparison of pointer with the
NETFS_FOLIO_COPY_TO_CACHE constant?

> + refcount_inc(&netfs_group->ref);
> + return netfs_group;
> +}
> +
> +/*
> + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
> + */
> +static inline void netfs_put_group(struct netfs_group *netfs_group)
> +{
> + if (netfs_group &&
> + netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&

Ditto. The same question here.

> + refcount_dec_and_test(&netfs_group->ref))
> + netfs_group->free(netfs_group);
> +}
> +
> +/*
> + * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
> + */
> +static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
> +{
> + if (netfs_group &&
> + netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&

Ditto.

Thanks,
Slava.

> + refcount_sub_and_test(nr, &netfs_group->ref))
> + netfs_group->free(netfs_group);
> +}
> +
> +/*
> + * Set the group pointer directly on a folio.
> + */
> +static inline void __netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
> +{
> + if (netfs_group)
> + folio_attach_private(folio, netfs_get_group(netfs_group));
> +}
> +
> +/*
> + * Set the group pointer on a folio or the folio info record.
> + */
> +static inline void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
> +{
> + void *priv = folio_get_private(folio);
> +
> + if (unlikely(priv != netfs_group)) {
> + if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
> + folio_attach_private(folio, netfs_get_group(netfs_group));
> + else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
> + folio_detach_private(folio);
> + }
> +}
> +
> #endif /* _LINUX_NETFS_H */
>
>