Re: [PATCH v2 00/16] block atomic writes

From: John Garry
Date: Tue Dec 19 2023 - 07:44:35 EST


On 19/12/2023 05:21, Christoph Hellwig wrote:
On Mon, Dec 18, 2023 at 09:14:56PM -0800, Darrick J. Wong wrote:
/me stumbles back in with plenty of covidbrain to go around.

So ... Christoph, you're asking for a common API for
sysadmins/applications to signal to the filesystem that they want all
data allocations aligned to a given value, right?

e.g. if a nvme device advertises a capability for untorn writes between
$lbasize and 64k, then we need a way to set up each untorn-file with
some alignment between $lbasize and 64k?

or if cxl storage becomes not ung-dly expensive, then we'd want a way to
set up files with an alignment of 2M (x86) or 512M (arm64lol) to take
advantage of PMD mmap mappings?

The most important point is to not mix these up.

If we want to use a file for atomic writes I should tell the fs about
it, and preferably in a way that does not require me to know about weird
internal implementation details of every file system. I really just
want to use atomic writes. Preferably I'd just start using them after
asking for the limits. But that's obviously not going to work for
file systems that use the hardware offload and don't otherwise align
to the limit (which would suck for very small files anyway :))

So as a compromise I tell the file system before writing or otherwise
adding any data [1] to file that I want to use atomic writes so that
the fs can take the right decisions.

[1] reflinking data into a such marked file will be ... interesting.


How about something based on fcntl, like below? We will prob also require some per-FS flag for enabling atomic writes without HW support. That flag might be also useful for XFS for differentiating forcealign for atomic writes with just forcealign.

---8<----

diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..d4a50655565a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -313,6 +313,15 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd,
}
}

+static long fcntl_atomic_write(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ if (file->f_op->enable_atomic_writes)
+ return file->f_op->enable_atomic_writes(file, arg);
+
+ return -EOPNOTSUPP;
+}
+
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
@@ -419,6 +428,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_SET_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
+ case F_SET_ATOMIC_WRITE_SIZE:
+ err = fcntl_atomic_write(filp, cmd, arg);
+ break;
default:
break;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..dba206cbe1ab 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1478,6 +1478,46 @@ xfs_file_mmap(
return 0;
}

+STATIC int
+xfs_enable_atomic_writes(
+ struct file *file,
+ unsigned int unit_max)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct dentry *dentry = file->f_path.dentry;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+ struct request_queue *q = bdev->bd_queue;
+ struct inode *inode = &ip->i_vnode;
+
+ if (queue_atomic_write_unit_max_bytes(q) == 0) {
+ if (unit_max != 0) {
+ /* We expect unbounded CoW size if no HW support */
+ return -ENOTBLK;
+ }
+ /* Do something for CoW support ... */
+
+ return 0;
+ }
+
+ if (!unit_max)
+ return -EINVAL;
+
+ /* bodge alert */
+ if (inode->i_op->fileattr_set) {
+ struct fileattr fa = {
+ .fsx_extsize = unit_max,
+ .fsx_xflags = FS_XFLAG_EXTSIZE | FS_XFLAG_FORCEALIGN,
+ .fsx_valid = 1,
+ };
+
+ return inode->i_op->fileattr_set(idmap, dentry, &fa);
+ }
+
+ return -EOPNOTSUPP;
+}
+
const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
.read_iter = xfs_file_read_iter,
@@ -1498,6 +1538,7 @@ const struct file_operations xfs_file_operations = {
.fallocate = xfs_file_fallocate,
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
+ .enable_atomic_writes = xfs_enable_atomic_writes,
};

const struct file_operations xfs_dir_file_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..a715f98057b8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1956,6 +1956,7 @@ struct file_operations {
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
unsigned int poll_flags);
+ int (*enable_atomic_writes)(struct file *, unsigned int unit_max);
} __randomize_layout;

/* Wrap a directory iterator that needs exclusive inode access */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6c80f96049bd..69780c49622e 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -56,6 +56,8 @@
#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13)
#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14)

+#define F_SET_ATOMIC_WRITE_SIZE (F_LINUX_SPECIFIC_BASE + 15)
+
/*
* Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
* used to clear any hints previously set.
--


--->8----