[PATCH v5 03/10] fs: Initial atomic write support

From: John Garry
Date: Mon Feb 26 2024 - 12:41:46 EST


From: Prasad Singamsetty <prasad.singamsetty@xxxxxxxxxx>

An atomic write is a write issued with torn-write protection, meaning
that for a power failure or any other hardware failure, all or none of the
data from the write will be stored, but never a mix of old and new data.

Userspace may add flag RWF_ATOMIC to pwritev2() to indicate that the
write is to be issued with torn-write prevention, according to special
alignment and length rules.

For any syscall interface utilizing struct iocb, add IOCB_ATOMIC for
iocb->ki_flags field to indicate the same.

A call to statx will give the relevant atomic write info for a file:
- atomic_write_unit_min
- atomic_write_unit_max
- atomic_write_segments_max

Both min and max values must be a power-of-2.

Applications can avail of atomic write feature by ensuring that the total
length of a write is a power-of-2 in size and also sized between
atomic_write_unit_min and atomic_write_unit_max, inclusive. Applications
must ensure that the write is at a naturally-aligned offset in the file
wrt the total write length. The value in atomic_write_segments_max
indicates the upper limit for IOV_ITER iovcnt.

Add file mode flag FMODE_CAN_ATOMIC_WRITE, so files which do not have the
flag set will have RWF_ATOMIC rejected and not just ignored.

Add a type argument to kiocb_set_rw_flags() to allows reads which have
RWF_ATOMIC set to be rejected.

Helper function generic_atomic_write_valid() can be used by FSes to verify
compliant writes. There we check for iov_iter type is for ubuf, which
implies iovcnt==1 for pwritev2(), which is an initial restriction for
atomic_write_segments_max.

Signed-off-by: Prasad Singamsetty <prasad.singamsetty@xxxxxxxxxx>
jpg: merge into single patch and much rewrite
Signed-off-by: John Garry <john.g.garry@xxxxxxxxxx>
---
fs/aio.c | 8 ++++----
fs/btrfs/ioctl.c | 2 +-
fs/read_write.c | 2 +-
include/linux/fs.h | 37 ++++++++++++++++++++++++++++++++++++-
include/uapi/linux/fs.h | 5 ++++-
io_uring/rw.c | 4 ++--
6 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index da18dbcfcb22..ba420faed82e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1509,7 +1509,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
iocb_put(iocb);
}

-static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
{
int ret;

@@ -1535,7 +1535,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
} else
req->ki_ioprio = get_current_ioprio();

- ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
+ ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type);
if (unlikely(ret))
return ret;

@@ -1587,7 +1587,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
struct file *file;
int ret;

- ret = aio_prep_rw(req, iocb);
+ ret = aio_prep_rw(req, iocb, READ);
if (ret)
return ret;
file = req->ki_filp;
@@ -1614,7 +1614,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
struct file *file;
int ret;

- ret = aio_prep_rw(req, iocb);
+ ret = aio_prep_rw(req, iocb, WRITE);
if (ret)
return ret;
file = req->ki_filp;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ac3316e0d11c..455f06d94b11 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4555,7 +4555,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
goto out_iov;

init_sync_kiocb(&kiocb, file);
- ret = kiocb_set_rw_flags(&kiocb, 0);
+ ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
if (ret)
goto out_iov;
kiocb.ki_pos = pos;
diff --git a/fs/read_write.c b/fs/read_write.c
index d4c036e82b6c..a7dc1819192d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -730,7 +730,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
ssize_t ret;

init_sync_kiocb(&kiocb, filp);
- ret = kiocb_set_rw_flags(&kiocb, flags);
+ ret = kiocb_set_rw_flags(&kiocb, flags, type);
if (ret)
return ret;
kiocb.ki_pos = (ppos ? *ppos : 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1fbc72c5f112..95946a706f23 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -43,6 +43,7 @@
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
+#include <linux/uio.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -119,6 +120,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define FMODE_PWRITE ((__force fmode_t)0x10)
/* File is opened for execution with sys_execve / sys_uselib */
#define FMODE_EXEC ((__force fmode_t)0x20)
+
+/* File supports atomic writes */
+#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)0x40)
+
/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH ((__force fmode_t)0x200)
/* 64bit hashes as llseek() offset (for directories) */
@@ -328,6 +333,7 @@ enum rw_hint {
#define IOCB_SYNC (__force int) RWF_SYNC
#define IOCB_NOWAIT (__force int) RWF_NOWAIT
#define IOCB_APPEND (__force int) RWF_APPEND
+#define IOCB_ATOMIC (__force int) RWF_ATOMIC

/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD (1 << 16)
@@ -362,6 +368,7 @@ enum rw_hint {
{ IOCB_SYNC, "SYNC" }, \
{ IOCB_NOWAIT, "NOWAIT" }, \
{ IOCB_APPEND, "APPEND" }, \
+ { IOCB_ATOMIC, "ATOMIC"}, \
{ IOCB_EVENTFD, "EVENTFD"}, \
{ IOCB_DIRECT, "DIRECT" }, \
{ IOCB_WRITE, "WRITE" }, \
@@ -3323,7 +3330,8 @@ static inline int iocb_flags(struct file *file)
return res;
}

-static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
+static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
+ int rw_type)
{
int kiocb_flags = 0;

@@ -3340,6 +3348,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
return -EOPNOTSUPP;
kiocb_flags |= IOCB_NOIO;
}
+ if (flags & RWF_ATOMIC) {
+ if (rw_type != WRITE)
+ return -EOPNOTSUPP;
+ if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE))
+ return -EOPNOTSUPP;
+ }
kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
if (flags & RWF_SYNC)
kiocb_flags |= IOCB_DSYNC;
@@ -3525,4 +3539,25 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
int advice);

+static inline
+bool generic_atomic_write_valid(loff_t pos, struct iov_iter *iter,
+ unsigned int unit_min, unsigned int unit_max)
+{
+ size_t len = iov_iter_count(iter);
+
+ if (!iter_is_ubuf(iter))
+ return false;
+
+ if (len < unit_min || len > unit_max)
+ return false;
+
+ if (!is_power_of_2(len))
+ return false;
+
+ if (!IS_ALIGNED(pos, len))
+ return false;
+
+ return true;
+}
+
#endif /* _LINUX_FS_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 48ad69f7722e..a0975ae81e64 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -301,9 +301,12 @@ typedef int __bitwise __kernel_rwf_t;
/* per-IO O_APPEND */
#define RWF_APPEND ((__force __kernel_rwf_t)0x00000010)

+/* Atomic Write */
+#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040)
+
/* mask of flags supported by the kernel */
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
- RWF_APPEND)
+ RWF_APPEND | RWF_ATOMIC)

/* Pagemap ioctl */
#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg)
diff --git a/io_uring/rw.c b/io_uring/rw.c
index d5e79d9bdc71..099dda3ff151 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -719,7 +719,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
struct kiocb *kiocb = &rw->kiocb;
struct io_ring_ctx *ctx = req->ctx;
struct file *file = req->file;
- int ret;
+ int ret, rw_type = (mode == FMODE_WRITE) ? WRITE : READ;

if (unlikely(!file || !(file->f_mode & mode)))
return -EBADF;
@@ -728,7 +728,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
req->flags |= io_file_get_flags(file);

kiocb->ki_flags = file->f_iocb_flags;
- ret = kiocb_set_rw_flags(kiocb, rw->flags);
+ ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
if (unlikely(ret))
return ret;
kiocb->ki_flags |= IOCB_ALLOC_CACHE;
--
2.31.1