+
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
@@ -256,7 +275,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua)
+ const struct iomap *iomap, bool use_fua, bool atomic)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@@ -268,6 +287,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ if (atomic)
+ opflags |= REQ_ATOMIC;
return opflags;
}
@@ -275,21 +296,23 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
struct iomap_dio *dio)
{
+ bool atomic = dio->iocb->ki_flags & IOCB_ATOMIC;
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
+ struct iov_iter *i = dio->submit.iter;
If you're going to pull this out into a convenience variable, please do
that as a separate patch so that the actual untorn write additions don't
get mixed in.
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
bool need_zeroout = false;
bool use_fua = false;
- int nr_pages, ret = 0;
+ int nr_pages, orig_nr_pages, ret = 0;
size_t copied = 0;
size_t orig_count;
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
- !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
+ !bdev_iter_is_aligned(iomap->bdev, i))
return -EINVAL;
if (iomap->type == IOMAP_UNWRITTEN) {
@@ -322,15 +345,35 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
}
+ if (dio->atomic_bio) {
+ /*
+ * These should not fail, but check just in case.
+ * Caller takes care of freeing the bio.
+ */
+ if (iter->iomap.bdev != dio->atomic_bio->bi_bdev) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (dio->atomic_bio->bi_iter.bi_sector +
+ (dio->atomic_bio->bi_iter.bi_size >> SECTOR_SHIFT) !=
Hmm, so I guess you stash an untorn write bio in the iomap_dio so that
multiple iomap_dio_bio_iter calls can try to combine a mixed mapping into
a single contiguous untorn write that can be completed all at once?
I suppose that works as long as the iomap->type is the same across all
the _iter calls, but I think that needs explicit checking here (rough
sketch below, after this if/else block).
+ iomap_sector(iomap, pos)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else if (atomic) {
+ orig_nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
+ }
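To be concrete, a rough sketch of the check I mean, pulled into a helper
so the type/bdev/contiguity tests sit in one place.  The helper name and
the atomic_bio_type field are my invention here; the field would have to
be added to struct iomap_dio to remember the mapping type of the first
extent:

/*
 * Hypothetical helper: decide whether this extent may be appended to the
 * stashed untorn-write bio.  Only called when dio->atomic_bio is set.
 */
static int iomap_dio_can_append_atomic(const struct iomap_iter *iter,
		struct iomap_dio *dio, loff_t pos)
{
	const struct iomap *iomap = &iter->iomap;
	struct bio *bio = dio->atomic_bio;

	/* the whole untorn write must come from one mapping type ... */
	if (iomap->type != dio->atomic_bio_type)
		return -EINVAL;
	/* ... on one device ... */
	if (iomap->bdev != bio->bi_bdev)
		return -EINVAL;
	/* ... and be physically contiguous with what we already have */
	if (bio->bi_iter.bi_sector +
	    (bio->bi_iter.bi_size >> SECTOR_SHIFT) !=
	    iomap_sector(iomap, pos))
		return -EINVAL;
	return 0;
}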
+
/*
* Save the original count and trim the iter to just the extent we
* are operating on right now. The iter will be re-expanded once
* we are done.
*/
- orig_count = iov_iter_count(dio->submit.iter);
- iov_iter_truncate(dio->submit.iter, length);
+ orig_count = iov_iter_count(i);
+ iov_iter_truncate(i, length);
- if (!iov_iter_count(dio->submit.iter))
+ if (!iov_iter_count(i))
goto out;
/*
@@ -365,27 +408,46 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* can set up the page vector appropriately for a ZONE_APPEND
* operation.
*/
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
+
+ if (atomic) {
+ size_t orig_atomic_size;
+
+ if (!dio->atomic_bio) {
+ dio->atomic_bio = iomap_dio_alloc_bio_data(iter,
+ dio, orig_nr_pages, bio_opf, pos);
+ }
+ orig_atomic_size = dio->atomic_bio->bi_iter.bi_size;
+
+ /*
+ * In case of error, caller takes care of freeing the bio. The
+ * smallest size of atomic write is i_node size, so no need for
What is "i_node size"? Are you referring to i_blocksize?
+ * tail zeroing out.
+ */
+ ret = bio_iov_iter_get_pages(dio->atomic_bio, i);
+ if (!ret) {
+ copied = dio->atomic_bio->bi_iter.bi_size -
+ orig_atomic_size;
+ }
- nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
+ dio->size += copied;
+ goto out;
+ }
+
+ nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
do {
size_t n;
if (dio->error) {
- iov_iter_revert(dio->submit.iter, copied);
+ iov_iter_revert(i, copied);
copied = ret = 0;
goto out;
}
- bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
+ bio = iomap_dio_alloc_bio_data(iter, dio, nr_pages, bio_opf, pos);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
- bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
- bio->bi_write_hint = inode->i_write_hint;
- bio->bi_ioprio = dio->iocb->ki_ioprio;
- bio->bi_private = dio;
- bio->bi_end_io = iomap_dio_bio_end_io;
I see two places (here and iomap_dio_zero) that allocate a bio and
perform some initialization of it. Can you move the common pieces into
iomap_dio_alloc_bio instead of adding an iomap_dio_alloc_bio_data
variant, and split that out as a separate cleanup patch?
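Roughly what I have in mind, with the pos parameter added by me so the
sector can be set at allocation time; the write hint, ioprio and fscrypt
context would stay in the data path since iomap_dio_zero does not need
them:

static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, unsigned short nr_vecs, blk_opf_t opf,
		loff_t pos)
{
	struct bio *bio;

	if (dio->dops && dio->dops->bio_set)
		bio = bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
				GFP_KERNEL, dio->dops->bio_set);
	else
		bio = bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);

	/* common initialization shared by the data path and iomap_dio_zero */
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;
	return bio;
}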
- ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
+ ret = bio_iov_iter_get_pages(bio, i);
if (unlikely(ret)) {
/*
* We have to stop part way through an IO. We must fall
@@ -408,8 +470,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
dio->size += n;
copied += n;
- nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
- BIO_MAX_VECS);
+ nr_pages = bio_iov_vecs_to_alloc(i, BIO_MAX_VECS);
/*
* We can only poll for single bio I/Os.
*/
@@ -435,7 +496,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
out:
/* Undo iter limitation to current extent */
- iov_iter_reexpand(dio->submit.iter, orig_count - copied);
+ iov_iter_reexpand(i, orig_count - copied);
if (copied)
return copied;
return ret;
@@ -555,6 +616,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
struct blk_plug plug;
struct iomap_dio *dio;
loff_t ret = 0;
+ size_t orig_count = iov_iter_count(iter);
trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
@@ -580,6 +642,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ if (bio_iov_vecs_to_alloc(iter, INT_MAX) > BIO_MAX_VECS)
+ return ERR_PTR(-EINVAL);
+ iomi.flags |= IOMAP_ATOMIC;
+ }
+ dio->atomic_bio = NULL;
+
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -665,6 +734,21 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iocb->ki_flags &= ~IOCB_HIPRI;
}
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ if (ret >= 0) {
+ if (dio->size == orig_count) {
+ iomap_dio_submit_bio(&iomi, dio,
+ dio->atomic_bio, iocb->ki_pos);
Does this need to do task_io_account_write like regular direct writes
do? (See the sketch at the end of this hunk.)
+ } else {
+ if (dio->atomic_bio)
+ bio_put(dio->atomic_bio);
+ ret = -EINVAL;
+ }
+ } else if (dio->atomic_bio) {
+ bio_put(dio->atomic_bio);
This ought to null out dio->atomic_bio to prevent an accidental
use-after-free.
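Putting that together with the accounting question above, roughly the
shape I would expect for the final submission here — untested, and the
task_io_account_write call just mirrors what the regular write loop
already does per bio:

	if (iocb->ki_flags & IOCB_ATOMIC) {
		if (ret >= 0 && dio->size == orig_count) {
			/* mirror the regular write path's accounting */
			task_io_account_write(dio->size);
			iomap_dio_submit_bio(&iomi, dio, dio->atomic_bio,
					iocb->ki_pos);
		} else {
			if (dio->atomic_bio)
				bio_put(dio->atomic_bio);
			if (ret >= 0)
				ret = -EINVAL;
		}
		/* either way the dio no longer owns the bio */
		dio->atomic_bio = NULL;
	}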