Re: [RFC PATCH v2 2/5] iomap: Add initial support for buffered RWF_WRITETHROUGH

From: Jan Kara

Date: Thu Apr 16 2026 - 08:39:16 EST


Some more thoughts... :)

> @@ -1096,6 +1097,276 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
> return __iomap_write_end(iter->inode, pos, len, copied, folio);
> }
>
> +static ssize_t iomap_writethrough_complete(struct iomap_writethrough_ctx *wt_ctx)
> +{
> + struct kiocb *iocb = wt_ctx->iocb;
> + struct inode *inode = wt_ctx->inode;
> + ssize_t ret = wt_ctx->error;
> +
> + if (wt_ctx->dops && wt_ctx->dops->end_io) {
> + int err = wt_ctx->dops->end_io(iocb, wt_ctx->written,
> + wt_ctx->error,
> + wt_ctx->flags);

It's a bit odd to use only ->end_io from dops, especially since we don't
really use the direct IO submission path. So perhaps you could have just an
end_io handler pointer in iomap_writethrough_ops, similar to the
submission one you already have?

> + if (err)
> + ret = err;
> + }
> +
> + mapping_clear_stable_writes(inode->i_mapping);
> +
> + if (!ret) {
> + ret = wt_ctx->written;
> + iocb->ki_pos = wt_ctx->pos + ret;
> + }
> +
> + kfree(wt_ctx);
> + return ret;
> +}

...

> +static int iomap_writethrough_iter(struct iomap_writethrough_ctx *wt_ctx,
> + struct iomap_iter *iter, struct iov_iter *i,
> + const struct iomap_writethrough_ops *wt_ops)
> +
> +{
> + ssize_t total_written = 0;
> + int status = 0;
> + struct address_space *mapping = iter->inode->i_mapping;
> + size_t chunk = mapping_max_folio_size(mapping);
> + unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
> + unsigned int bs = i_blocksize(iter->inode);
> +
> + /* copied over based on DIO handles these flags */
> + if (iter->iomap.type == IOMAP_UNWRITTEN)
> + wt_ctx->flags |= IOMAP_DIO_UNWRITTEN;
> + if (iter->iomap.flags & IOMAP_F_SHARED)
> + wt_ctx->flags |= IOMAP_DIO_COW;
> +
> + if (!(iter->flags & IOMAP_WRITETHROUGH))
> + return -EINVAL;
> +
> + do {
> + struct folio *folio;
> + size_t offset; /* Offset into folio */
> + u64 bytes; /* Bytes to write to folio */
> + size_t copied; /* Bytes copied from user */
> + u64 written; /* Bytes have been written */
> + loff_t pos;
> + size_t off_aligned, len_aligned;
> +
> + bytes = iov_iter_count(i);
> +retry:
> + offset = iter->pos & (chunk - 1);
> + bytes = min(chunk - offset, bytes);
> + status = balance_dirty_pages_ratelimited_flags(mapping,
> + bdp_flags);
> + if (unlikely(status))
> + break;
> +
> + /*
> + * If completions already occurred and reported errors, give up
> + * now and don't bother submitting more bios.
> + */
> + if (unlikely(data_race(wt_ctx->error))) {
> + wt_ctx->nr_bvecs = 0;
> + break;
> + }
> +
> + if (bytes > iomap_length(iter))
> + bytes = iomap_length(iter);
> +
> + /*
> + * Bring in the user page that we'll copy from _first_.
> + * Otherwise there's a nasty deadlock on copying from the
> + * same page as we're writing to, without it being marked
> + * up-to-date.
> + *
> + * For async buffered writes the assumption is that the user
> + * page has already been faulted in. This can be optimized by
> + * faulting the user page.
> + */
> + if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
> + status = -EFAULT;
> + break;
> + }
> +
> + status = iomap_write_begin(iter, wt_ops->write_ops, &folio,
> + &offset, &bytes);
> + if (unlikely(status)) {
> + iomap_write_failed(iter->inode, iter->pos, bytes);
> + break;
> + }
> + if (iter->iomap.flags & IOMAP_F_STALE)
> + break;
> +
> + pos = iter->pos;
> +
> + if (mapping_writably_mapped(mapping))
> + flush_dcache_folio(folio);
> +
> + copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> + written = iomap_write_end(iter, bytes, copied, folio) ?
> + copied : 0;
> +
> + if (!written)
> + goto put_folio;
> +
> + off_aligned = round_down(offset, bs);
> + len_aligned = round_up(offset + written, bs) - off_aligned;
> +
> + iomap_folio_prepare_writethrough(folio, off_aligned,
> + len_aligned);
> +
> + if (!wt_ctx->nr_bvecs)
> + wt_ctx->bio_pos = round_down(pos, bs);
> +
> + bvec_set_folio(&wt_ctx->bvec[wt_ctx->nr_bvecs], folio,
> + len_aligned, off_aligned);

Shouldn't we zero out the tail of the folio if we are submitting a partial
folio for write?

> + wt_ctx->nr_bvecs++;
> + wt_ctx->written += written;
> +
> + if (pos + written > wt_ctx->new_i_size)
> + wt_ctx->new_i_size = pos + written;

I'm probably missing something here, but where is the i_size update handled? I
don't see new_i_size used anywhere. Also, why is it OK not to call
pagecache_isize_extended()? But that goes together with the i_size update...

> +
> + if (wt_ctx->nr_bvecs == wt_ctx->max_bvecs)
> + iomap_writethrough_submit_bio(wt_ctx, &iter->iomap, wt_ops);
> +
> +put_folio:
> + __iomap_put_folio(iter, wt_ops->write_ops, written, folio);
> +
> + cond_resched();
> + if (unlikely(written == 0)) {
> + iomap_write_failed(iter->inode, pos, bytes);
> + iov_iter_revert(i, copied);
> +
> + if (chunk > PAGE_SIZE)
> + chunk /= 2;
> + if (copied) {
> + bytes = copied;
> + goto retry;
> + }
> + } else {
> + total_written += written;
> + iomap_iter_advance(iter, written);
> + }
> + } while (iov_iter_count(i) && iomap_length(iter));

Overall, the differences between this function and iomap_write_iter() seem
relatively small, so maybe it would be possible to just extend
iomap_write_iter() to support writethrough IO as well? Basically, once we've
copied the data into the folio and called iomap_write_end(), we could have "if
writethrough, call a function to prepare & submit the folio for IO".

> +
> + if (wt_ctx->nr_bvecs)
> + iomap_writethrough_submit_bio(wt_ctx, &iter->iomap, wt_ops);
> +
> + return total_written ? 0 : status;
> +}
> +

Honza
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR