Re: [PATCH v7 14/14] xfs: allow sysadmins to specify a maximum atomic write limit at mount time

From: Darrick J. Wong
Date: Tue Apr 15 2025 - 13:03:24 EST


On Tue, Apr 15, 2025 at 12:14:25PM +0000, John Garry wrote:
> From: "Darrick J. Wong" <djwong@xxxxxxxxxx>
>
> Introduce a mount option to allow sysadmins to specify the maximum size
> of an atomic write. When this happens, we dynamically recompute the
> tr_atomic_write transaction reservation based on the given block size,
> and then check that we don't violate any of the minimum log size
> constraints.
>
> The actual software atomic write max is still computed based off of
> tr_atomic the same way it has for the past few commits.
>
> Signed-off-by: "Darrick J. Wong" <djwong@xxxxxxxxxx>
> Signed-off-by: John Garry <john.g.garry@xxxxxxxxxx>
> ---
> Documentation/admin-guide/xfs.rst | 8 +++++
> fs/xfs/libxfs/xfs_trans_resv.c | 54 +++++++++++++++++++++++++++++++
> fs/xfs/libxfs/xfs_trans_resv.h | 1 +
> fs/xfs/xfs_mount.c | 8 ++++-
> fs/xfs/xfs_mount.h | 5 +++
> fs/xfs/xfs_super.c | 28 +++++++++++++++-
> fs/xfs/xfs_trace.h | 33 +++++++++++++++++++
> 7 files changed, 135 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
> index b67772cf36d6..715019ec4f24 100644
> --- a/Documentation/admin-guide/xfs.rst
> +++ b/Documentation/admin-guide/xfs.rst
> @@ -143,6 +143,14 @@ When mounting an XFS filesystem, the following options are accepted.
> optional, and the log section can be separate from the data
> section or contained within it.
>
> + max_atomic_write=value
> + Set the maximum size of an atomic write. The size may be
> + specified in bytes, in kilobytes with a "k" suffix, in megabytes
> + with a "m" suffix, or in gigabytes with a "g" suffix.
> +
> + The default value is to set the maximum io completion size
> + to allow each CPU to handle one at a time.
> +
> noalign
> Data allocations will not be aligned at stripe unit
> boundaries. This is only relevant to filesystems created
> diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
> index f530aa5d72f5..36e47ec3c3c2 100644
> --- a/fs/xfs/libxfs/xfs_trans_resv.c
> +++ b/fs/xfs/libxfs/xfs_trans_resv.c
> @@ -1475,3 +1475,57 @@ xfs_calc_max_atomic_write_fsblocks(
>
> return ret;
> }
> +
> +/*
> + * Compute the log reservation needed to complete an atomic write of a given
> + * number of blocks. Worst case, each block requires separate handling.
> + * Returns true if the blockcount is supported, false otherwise.
> + */
> +bool
> +xfs_calc_atomic_write_reservation(
> + struct xfs_mount *mp,
> + int bytes)

Hmm, the comment says this should be a block count, not a byte count.

xfs_extlen_t blockcount)

> +{
> + struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend;
> + unsigned int per_intent, step_size;
> + unsigned int logres;
> + xfs_extlen_t blockcount = XFS_B_TO_FSBT(mp, bytes);
> + uint old_logres =
> + M_RES(mp)->tr_atomic_ioend.tr_logres;
> + int min_logblocks;
> +
> + /*
> + * If the caller doesn't ask for a specific atomic write size, then
> + * we'll use conservatively use tr_itruncate as the basis for computing
> + * a reasonable maximum.
> + */
> + if (blockcount == 0) {
> + curr_res->tr_logres = M_RES(mp)->tr_itruncate.tr_logres;
> + return true;
> + }
> +
> + /* Untorn write completions require out of place write remapping */
> + if (!xfs_has_reflink(mp))
> + return false;
> +
> + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);
> +
> + if (check_mul_overflow(blockcount, per_intent, &logres))
> + return false;
> + if (check_add_overflow(logres, step_size, &logres))
> + return false;
> +
> + curr_res->tr_logres = logres;
> + min_logblocks = xfs_log_calc_minimum_size(mp);
> +
> + trace_xfs_calc_max_atomic_write_reservation(mp, per_intent, step_size,
> + blockcount, min_logblocks, curr_res->tr_logres);
> +
> + if (min_logblocks > mp->m_sb.sb_logblocks) {
> + /* Log too small, revert changes. */
> + curr_res->tr_logres = old_logres;
> + return false;
> + }
> +
> + return true;
> +}
> diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
> index a6d303b83688..af974f920556 100644
> --- a/fs/xfs/libxfs/xfs_trans_resv.h
> +++ b/fs/xfs/libxfs/xfs_trans_resv.h
> @@ -122,5 +122,6 @@ unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
> unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
>
> xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
> +bool xfs_calc_atomic_write_reservation(struct xfs_mount *mp, int bytes);
>
> #endif /* __XFS_TRANS_RESV_H__ */
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 860fc3c91fd5..b8dd9e956c2a 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -671,7 +671,7 @@ static inline unsigned int max_pow_of_two_factor(const unsigned int nr)
> return 1 << (ffs(nr) - 1);
> }
>
> -static inline void
> +void
> xfs_compute_atomic_write_unit_max(
> struct xfs_mount *mp)
> {
> @@ -1160,6 +1160,12 @@ xfs_mountfs(
> * derived from transaction reservations, so we must do this after the
> * log is fully initialized.
> */
> + if (!xfs_calc_atomic_write_reservation(mp, mp->m_awu_max_bytes)) {
> + xfs_warn(mp, "cannot support atomic writes of %u bytes",
> + mp->m_awu_max_bytes);
> + error = -EINVAL;
> + goto out_agresv;
> + }

Hmm. I don't think this is sufficient validation of m_awu_max_bytes.
xfs_compute_atomic_write_unit_max never allows an atomic write that is
larger than MAX_RW_COUNT or larger than the allocation group, because
it's not possible to land a single write larger than either of those
sizes. The parsing code ignores values that aren't congruent with
an fsblock size, and suffix_kstrtoint gets confused if you feed it
values that are too large (like "2g"). I propose something like this:

/*
* Try to set the atomic write maximum to a new value that we got from
* userspace via mount option.
*/
int
xfs_set_max_atomic_write_opt(
struct xfs_mount *mp,
unsigned long long new_max_bytes)
{
xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);

if (new_max_bytes) {
xfs_extlen_t max_write_fsbs =
rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
xfs_extlen_t max_group_fsbs =
max(mp->m_groups[XG_TYPE_AG].blocks,
mp->m_groups[XG_TYPE_RTG].blocks);

ASSERT(max_write_fsbs <= U32_MAX);

if (new_max_bytes % mp->m_sb.sb_blocksize > 0) {
xfs_warn(mp,
"max atomic write size of %llu bytes not aligned with fsblock",
new_max_bytes);
return -EINVAL;
}

if (new_max_fsbs > max_write_fsbs) {
xfs_warn(mp,
"max atomic write size of %lluk cannot be larger than max write size %lluk",
new_max_bytes >> 10,
XFS_FSB_TO_B(mp, max_write_fsbs) >> 10);
return -EINVAL;
}

if (new_max_fsbs > max_group_fsbs) {
xfs_warn(mp,
"max atomic write size of %lluk cannot be larger than allocation group size %lluk",
new_max_bytes >> 10,
XFS_FSB_TO_B(mp, max_group_fsbs) >> 10);
return -EINVAL;
}
}

if (!xfs_calc_atomic_write_reservation(mp, new_max_fsbs)) {
xfs_warn(mp,
"cannot support completing atomic writes of %lluk",
new_max_bytes >> 10);
return -EINVAL;
}

xfs_compute_atomic_write_unit_max(mp);
return 0;
}

and then we get some nicer error messages about what exactly failed
validation.

> xfs_compute_atomic_write_unit_max(mp);
>
> return 0;
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index c0eff3adfa31..a5037db4ecff 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -236,6 +236,9 @@ typedef struct xfs_mount {
> bool m_update_sb; /* sb needs update in mount */
> unsigned int m_max_open_zones;
>
> + /* max_atomic_write mount option value */
> + unsigned int m_awu_max_bytes;
> +
> /*
> * Bitsets of per-fs metadata that have been checked and/or are sick.
> * Callers must hold m_sb_lock to access these two fields.
> @@ -798,4 +801,6 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
> percpu_counter_add(&mp->m_delalloc_blks, delta);
> }
>
> +void xfs_compute_atomic_write_unit_max(struct xfs_mount *mp);
> +
> #endif /* __XFS_MOUNT_H__ */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b2dd0c0bf509..f7849052e5ff 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -111,7 +111,7 @@ enum {
> Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
> Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
> Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
> - Opt_lifetime, Opt_nolifetime,
> + Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
> };
>
> static const struct fs_parameter_spec xfs_fs_parameters[] = {
> @@ -159,6 +159,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
> fsparam_u32("max_open_zones", Opt_max_open_zones),
> fsparam_flag("lifetime", Opt_lifetime),
> fsparam_flag("nolifetime", Opt_nolifetime),
> + fsparam_string("max_atomic_write", Opt_max_atomic_write),
> {}
> };
>
> @@ -241,6 +242,9 @@ xfs_fs_show_options(
>
> if (mp->m_max_open_zones)
> seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
> + if (mp->m_awu_max_bytes)
> + seq_printf(m, ",max_atomic_write=%uk",
> + mp->m_awu_max_bytes >> 10);
>
> return 0;
> }
> @@ -1518,6 +1522,13 @@ xfs_fs_parse_param(
> case Opt_nolifetime:
> parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
> return 0;
> + case Opt_max_atomic_write:
> + if (suffix_kstrtoint(param->string, 10,
> + &parsing_mp->m_awu_max_bytes))

Let's replace this with a new suffix_kstrtoull helper that returns an
unsigned long long quantity that won't get confused. This has the
unfortunate consequence that we have to burn a u64 in xfs_mount instead
of a u32.

--D

> + return -EINVAL;
> + if (parsing_mp->m_awu_max_bytes < 0)
> + return -EINVAL;
> + return 0;
> default:
> xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
> return -EINVAL;
> @@ -2114,6 +2125,16 @@ xfs_fs_reconfigure(
> if (error)
> return error;
>
> + /* validate new max_atomic_write option before making other changes */
> + if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
> + if (!xfs_calc_atomic_write_reservation(mp,
> + new_mp->m_awu_max_bytes)) {
> + xfs_warn(mp, "cannot support atomic writes of %u bytes",
> + new_mp->m_awu_max_bytes);
> + return -EINVAL;
> + }
> + }
> +
> /* inode32 -> inode64 */
> if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
> mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
> @@ -2140,6 +2161,11 @@ xfs_fs_reconfigure(
> return error;
> }
>
> + /* set new atomic write max here */
> + if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
> + xfs_compute_atomic_write_unit_max(mp);
> + mp->m_awu_max_bytes = new_mp->m_awu_max_bytes;
> + }
> return 0;
> }
>
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 24d73e9bbe83..d41885f1efe2 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -230,6 +230,39 @@ TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
> __entry->blockcount)
> );
>
> +TRACE_EVENT(xfs_calc_max_atomic_write_reservation,
> + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
> + unsigned int step_size, unsigned int blockcount,
> + unsigned int min_logblocks, unsigned int logres),
> + TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres),
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __field(unsigned int, per_intent)
> + __field(unsigned int, step_size)
> + __field(unsigned int, blockcount)
> + __field(unsigned int, min_logblocks)
> + __field(unsigned int, cur_logblocks)
> + __field(unsigned int, logres)
> + ),
> + TP_fast_assign(
> + __entry->dev = mp->m_super->s_dev;
> + __entry->per_intent = per_intent;
> + __entry->step_size = step_size;
> + __entry->blockcount = blockcount;
> + __entry->min_logblocks = min_logblocks;
> + __entry->cur_logblocks = mp->m_sb.sb_logblocks;
> + __entry->logres = logres;
> + ),
> + TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u",
> + MAJOR(__entry->dev), MINOR(__entry->dev),
> + __entry->per_intent,
> + __entry->step_size,
> + __entry->blockcount,
> + __entry->min_logblocks,
> + __entry->cur_logblocks,
> + __entry->logres)
> +);
> +
> TRACE_EVENT(xlog_intent_recovery_failed,
> TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
> int error),
> --
> 2.31.1
>
>