[RFC PATCH 51/61] afs: Use ITER_MAPPING for writing

From: David Howells
Date: Mon May 04 2020 - 13:15:35 EST


Use a single ITER_MAPPING iterator to describe the portion of a file to be
transmitted to the server rather than generating a series of small
ITER_BVEC iterators on the fly. This will make it easier to implement AIO
in afs.

In theory we could use one giant ITER_BVEC instead, but that would mean
potentially allocating a huge array of bio_vec structs (max 256 per page)
when the pagecache already has a structure listing all the relevant
pages (radix_tree/xarray) that can be walked over.

Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---

fs/afs/fsclient.c | 38 ++++-------------
fs/afs/internal.h | 18 +++-----
fs/afs/rxrpc.c | 99 +++++++-------------------------------------
fs/afs/write.c | 80 +++++++++++++++++++-----------------
fs/afs/yfsclient.c | 19 ++------
include/trace/events/afs.h | 51 ++++++++---------------
6 files changed, 96 insertions(+), 209 deletions(-)

diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 8222ccf01280..db80c2618778 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1108,10 +1108,7 @@ static const struct afs_call_type afs_RXFSStoreData64 = {
/*
* store a set of pages to a very large file
*/
-static int afs_fs_store_data64(struct afs_fs_cursor *fc,
- struct address_space *mapping,
- pgoff_t first, pgoff_t last,
- unsigned offset, unsigned to,
+static int afs_fs_store_data64(struct afs_fs_cursor *fc, struct iov_iter *iter,
loff_t size, loff_t pos, loff_t i_size,
struct afs_status_cb *scb)
{
@@ -1130,12 +1127,7 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
return -ENOMEM;

call->key = fc->key;
- call->mapping = mapping;
- call->first = first;
- call->last = last;
- call->first_offset = offset;
- call->last_to = to;
- call->send_pages = true;
+ call->write_iter = iter;
call->out_scb = scb;

/* marshall the parameters */
@@ -1166,30 +1158,24 @@ static int afs_fs_store_data64(struct afs_fs_cursor *fc,
}

/*
- * store a set of pages
+ * Write data to a file on the server.
*/
-int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
- pgoff_t first, pgoff_t last,
- unsigned offset, unsigned to,
+int afs_fs_store_data(struct afs_fs_cursor *fc, struct iov_iter *iter, loff_t pos,
struct afs_status_cb *scb)
{
struct afs_vnode *vnode = fc->vnode;
struct afs_call *call;
struct afs_net *net = afs_v2net(vnode);
- loff_t size, pos, i_size;
+ loff_t size, i_size;
__be32 *bp;

if (test_bit(AFS_SERVER_FL_IS_YFS, &fc->cbi->server->flags))
- return yfs_fs_store_data(fc, mapping, first, last, offset, to, scb);
+ return yfs_fs_store_data(fc, iter, pos, scb);

_enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);

- size = (loff_t)to - (loff_t)offset;
- if (first != last)
- size += (loff_t)(last - first) << PAGE_SHIFT;
- pos = (loff_t)first << PAGE_SHIFT;
- pos += offset;
+ size = iov_iter_count(iter);

i_size = i_size_read(&vnode->vfs_inode);
if (pos + size > i_size)
@@ -1200,8 +1186,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
(unsigned long long) i_size);

if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32)
- return afs_fs_store_data64(fc, mapping, first, last, offset, to,
- size, pos, i_size, scb);
+ return afs_fs_store_data64(fc, iter, size, pos, i_size, scb);

call = afs_alloc_flat_call(net, &afs_RXFSStoreData,
(4 + 6 + 3) * 4,
@@ -1210,12 +1195,7 @@ int afs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
return -ENOMEM;

call->key = fc->key;
- call->mapping = mapping;
- call->first = first;
- call->last = last;
- call->first_offset = offset;
- call->last_to = to;
- call->send_pages = true;
+ call->write_iter = iter;
call->out_scb = scb;

/* marshall the parameters */
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index e676ad145272..0cd9e998d52c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -118,6 +118,7 @@ struct afs_call {
struct address_space *mapping; /* Pages being written from */
size_t iov_len; /* Size of *iter to be used */
struct iov_iter def_iter; /* Default buffer/data iterator */
+ struct iov_iter *write_iter; /* Iterator defining write to be made */
struct iov_iter *iter; /* Iterator currently in use */
union { /* Convenience for ->def_iter */
struct kvec kvec[1];
@@ -138,8 +139,6 @@ struct afs_call {
struct afs_volume_status *out_volstatus;
struct afs_read *read_request;
unsigned int server_index;
- pgoff_t first; /* first page in mapping to deal with */
- pgoff_t last; /* last page in mapping to deal with */
atomic_t usage;
enum afs_call_state state;
spinlock_t state_lock;
@@ -149,15 +148,10 @@ struct afs_call {
unsigned int max_lifespan; /* Maximum lifespan to set if not 0 */
unsigned request_size; /* size of request data */
unsigned reply_max; /* maximum size of reply */
- unsigned first_offset; /* offset into mapping[first] */
- union {
- unsigned last_to; /* amount of mapping[last] */
- unsigned count2; /* count used in unmarshalling */
- };
+ unsigned count2; /* count used in unmarshalling */
unsigned char unmarshall; /* unmarshalling phase */
unsigned char addr_ix; /* Address in ->alist */
bool drop_ref; /* T if need to drop ref for incoming call */
- bool send_pages; /* T if data from mapping should be sent */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
@@ -962,8 +956,8 @@ extern int afs_fs_symlink(struct afs_fs_cursor *, const char *, const char *,
extern int afs_fs_rename(struct afs_fs_cursor *, const char *,
struct afs_vnode *, const char *,
struct afs_status_cb *, struct afs_status_cb *);
-extern int afs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
- pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *);
+extern int afs_fs_store_data(struct afs_fs_cursor *, struct iov_iter *, loff_t,
+ struct afs_status_cb *);
extern int afs_fs_setattr(struct afs_fs_cursor *, struct iattr *, struct afs_status_cb *);
extern int afs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
extern int afs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t, struct afs_status_cb *);
@@ -1378,8 +1372,8 @@ extern int yfs_fs_symlink(struct afs_fs_cursor *, const char *, const char *,
struct afs_status_cb *, struct afs_fid *, struct afs_status_cb *);
extern int yfs_fs_rename(struct afs_fs_cursor *, const char *, struct afs_vnode *, const char *,
struct afs_status_cb *, struct afs_status_cb *);
-extern int yfs_fs_store_data(struct afs_fs_cursor *, struct address_space *,
- pgoff_t, pgoff_t, unsigned, unsigned, struct afs_status_cb *);
+extern int yfs_fs_store_data(struct afs_fs_cursor *, struct iov_iter *, loff_t,
+ struct afs_status_cb *);
extern int yfs_fs_setattr(struct afs_fs_cursor *, struct iattr *, struct afs_status_cb *);
extern int yfs_fs_get_volume_status(struct afs_fs_cursor *, struct afs_volume_status *);
extern int yfs_fs_set_lock(struct afs_fs_cursor *, afs_lock_type_t, struct afs_status_cb *);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index b2296feaaff3..98da499232a3 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -274,39 +274,6 @@ void afs_flat_call_destructor(struct afs_call *call)
call->buffer = NULL;
}

-#define AFS_BVEC_MAX 8
-
-/*
- * Load the given bvec with the next few pages.
- */
-static void afs_load_bvec(struct afs_call *call, struct msghdr *msg,
- struct bio_vec *bv, pgoff_t first, pgoff_t last,
- unsigned offset)
-{
- struct page *pages[AFS_BVEC_MAX];
- unsigned int nr, n, i, to, bytes = 0;
-
- nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX);
- n = find_get_pages_contig(call->mapping, first, nr, pages);
- ASSERTCMP(n, ==, nr);
-
- msg->msg_flags |= MSG_MORE;
- for (i = 0; i < nr; i++) {
- to = PAGE_SIZE;
- if (first + i >= last) {
- to = call->last_to;
- msg->msg_flags &= ~MSG_MORE;
- }
- bv[i].bv_page = pages[i];
- bv[i].bv_len = to - offset;
- bv[i].bv_offset = offset;
- bytes += to - offset;
- offset = 0;
- }
-
- iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes);
-}
-
/*
* Advance the AFS call state when the RxRPC call ends the transmit phase.
*/
@@ -319,41 +286,6 @@ static void afs_notify_end_request_tx(struct sock *sock,
afs_set_call_state(call, AFS_CALL_CL_REQUESTING, AFS_CALL_CL_AWAIT_REPLY);
}

-/*
- * attach the data from a bunch of pages on an inode to a call
- */
-static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
-{
- struct bio_vec bv[AFS_BVEC_MAX];
- unsigned int bytes, nr, loop, offset;
- pgoff_t first = call->first, last = call->last;
- int ret;
-
- offset = call->first_offset;
- call->first_offset = 0;
-
- do {
- afs_load_bvec(call, msg, bv, first, last, offset);
- trace_afs_send_pages(call, msg, first, last, offset);
-
- offset = 0;
- bytes = msg->msg_iter.count;
- nr = msg->msg_iter.nr_segs;
-
- ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, msg,
- bytes, afs_notify_end_request_tx);
- for (loop = 0; loop < nr; loop++)
- put_page(bv[loop].bv_page);
- if (ret < 0)
- break;
-
- first += nr;
- } while (first <= last);
-
- trace_afs_sent_pages(call, call->first, last, first, ret);
- return ret;
-}
-
/*
* Initiate a call and synchronously queue up the parameters for dispatch. Any
* error is stored into the call struct, which the caller must check for.
@@ -385,19 +317,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
* after the initial fixed part.
*/
tx_total_len = call->request_size;
- if (call->send_pages) {
- if (call->last == call->first) {
- tx_total_len += call->last_to - call->first_offset;
- } else {
- /* It looks mathematically like you should be able to
- * combine the following lines with the ones above, but
- * unsigned arithmetic is fun when it wraps...
- */
- tx_total_len += PAGE_SIZE - call->first_offset;
- tx_total_len += call->last_to;
- tx_total_len += (call->last - call->first - 1) * PAGE_SIZE;
- }
- }
+ if (call->write_iter)
+ tx_total_len += iov_iter_count(call->write_iter);

/* If the call is going to be asynchronous, we need an extra ref for
* the call to hold itself so the caller need not hang on to its ref.
@@ -439,7 +360,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
- msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0);
+ msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0);

ret = rxrpc_kernel_send_data(call->net->socket, rxcall,
&msg, call->request_size,
@@ -447,8 +368,18 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
if (ret < 0)
goto error_do_abort;

- if (call->send_pages) {
- ret = afs_send_pages(call, &msg);
+ if (call->write_iter) {
+ msg.msg_iter = *call->write_iter;
+ msg.msg_flags &= ~MSG_MORE;
+ trace_afs_send_data(call, &msg);
+
+ ret = rxrpc_kernel_send_data(call->net->socket,
+ call->rxcall, &msg,
+ iov_iter_count(&msg.msg_iter),
+ afs_notify_end_request_tx);
+ *call->write_iter = msg.msg_iter;
+
+ trace_afs_sent_data(call, &msg, ret);
if (ret < 0)
goto error_do_abort;
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 174e355aee6d..44dd4d0bad70 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -311,38 +311,30 @@ static void afs_redirty_pages(struct writeback_control *wbc,
/*
* completion of write to server
*/
-static void afs_pages_written_back(struct afs_vnode *vnode,
- pgoff_t first, pgoff_t last)
+static void afs_pages_written_back(struct afs_vnode *vnode, pgoff_t start, pgoff_t last)
{
- struct pagevec pv;
+ struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct page *page;
unsigned long priv;
- unsigned count, loop;
+
+ XA_STATE(xas, &mapping->i_pages, start);

_enter("{%llx:%llu},{%lx-%lx}",
- vnode->fid.vid, vnode->fid.vnode, first, last);
+ vnode->fid.vid, vnode->fid.vnode, start, last);

- pagevec_init(&pv);
+ rcu_read_lock();

- do {
- _debug("done %lx-%lx", first, last);
+ xas_for_each(&xas, page, last) {
+ ASSERT(PageWriteback(page));

- count = last - first + 1;
- if (count > PAGEVEC_SIZE)
- count = PAGEVEC_SIZE;
- pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
- first, count, pv.pages);
- ASSERTCMP(pv.nr, ==, count);
+ priv = page_private(page);
+ trace_afs_page_dirty(vnode, tracepoint_string("clear"),
+ page->index, priv);
+ set_page_private(page, 0);
+ page_endio(page, true, 0);
+ }

- for (loop = 0; loop < count; loop++) {
- priv = page_private(pv.pages[loop]);
- trace_afs_page_dirty(vnode, tracepoint_string("clear"),
- pv.pages[loop]->index, priv);
- set_page_private(pv.pages[loop], 0);
- end_page_writeback(pv.pages[loop]);
- }
- first += count;
- __pagevec_release(&pv);
- } while (first <= last);
+ rcu_read_unlock();

afs_prune_wb_keys(vnode);
_leave("");
@@ -351,23 +343,22 @@ static void afs_pages_written_back(struct afs_vnode *vnode,
/*
* write to a file
*/
-static int afs_store_data(struct address_space *mapping,
- pgoff_t first, pgoff_t last,
- unsigned offset, unsigned to)
+static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter,
+ loff_t pos, pgoff_t first, pgoff_t last)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct afs_fs_cursor fc;
struct afs_status_cb *scb;
struct afs_wb_key *wbk = NULL;
struct list_head *p;
+ loff_t count = iov_iter_count(iter);
int ret = -ENOKEY, ret2;

- _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x",
+ _enter("%s{%llx:%llu.%u},%llx,%llx",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
- first, last, offset, to);
+ count, pos);

scb = kzalloc(sizeof(struct afs_status_cb), GFP_NOFS);
if (!scb)
@@ -407,7 +398,7 @@ static int afs_store_data(struct address_space *mapping,

while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(vnode);
- afs_fs_store_data(&fc, mapping, first, last, offset, to, scb);
+ afs_fs_store_data(&fc, iter, pos, scb);
}

afs_check_for_remote_deletion(&fc, vnode);
@@ -421,9 +412,7 @@ static int afs_store_data(struct address_space *mapping,
switch (ret) {
case 0:
afs_stat_v(vnode, n_stores);
- atomic_long_add((last * PAGE_SIZE + to) -
- (first * PAGE_SIZE + offset),
- &afs_v2net(vnode)->n_store_bytes);
+ atomic_long_add(count, &afs_v2net(vnode)->n_store_bytes);
break;
case -EACCES:
case -EPERM:
@@ -454,10 +443,12 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
pgoff_t final_page)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct iov_iter iter;
struct page *pages[8], *page;
unsigned long count, priv;
unsigned n, offset, to, f, t;
pgoff_t start, first, last;
+ loff_t a, b;
int loop, ret;

_enter(",%lx", primary_page->index);
@@ -557,10 +548,17 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,

first = primary_page->index;
last = first + count - 1;
-
_debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);

- ret = afs_store_data(mapping, first, last, offset, to);
+ a = first;
+ a <<= PAGE_SHIFT;
+ a += offset;
+ b = last;
+ b <<= PAGE_SHIFT;
+ b += to;
+ iov_iter_mapping(&iter, WRITE, mapping, a, b - a);
+
+ ret = afs_store_data(vnode, &iter, a, first, last);
switch (ret) {
case 0:
ret = count;
@@ -848,6 +846,8 @@ int afs_launder_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct iov_iter iter;
+ struct bio_vec bv[1];
unsigned long priv;
unsigned int f, t;
int ret = 0;
@@ -863,9 +863,15 @@ int afs_launder_page(struct page *page)
t = priv >> AFS_PRIV_SHIFT;
}

+ bv[0].bv_page = page;
+ bv[0].bv_offset = f;
+ bv[0].bv_len = t - f;
+ iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
+
trace_afs_page_dirty(vnode, tracepoint_string("launder"),
page->index, priv);
- ret = afs_store_data(mapping, page->index, page->index, t, f);
+ ret = afs_store_data(vnode, &iter, (loff_t)page->index << PAGE_SHIFT,
+ page->index, page->index);
}

trace_afs_page_dirty(vnode, tracepoint_string("laundered"),
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 518b9489ff9e..0b744a117dde 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -1216,25 +1216,19 @@ static const struct afs_call_type yfs_RXYFSStoreData64 = {
/*
* Store a set of pages to a large file.
*/
-int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
- pgoff_t first, pgoff_t last,
- unsigned offset, unsigned to,
+int yfs_fs_store_data(struct afs_fs_cursor *fc, struct iov_iter *iter, loff_t pos,
struct afs_status_cb *scb)
{
struct afs_vnode *vnode = fc->vnode;
struct afs_call *call;
struct afs_net *net = afs_v2net(vnode);
- loff_t size, pos, i_size;
+ loff_t size, i_size;
__be32 *bp;

_enter(",%x,{%llx:%llu},,",
key_serial(fc->key), vnode->fid.vid, vnode->fid.vnode);

- size = (loff_t)to - (loff_t)offset;
- if (first != last)
- size += (loff_t)(last - first) << PAGE_SHIFT;
- pos = (loff_t)first << PAGE_SHIFT;
- pos += offset;
+ size = iov_iter_count(iter);

i_size = i_size_read(&vnode->vfs_inode);
if (pos + size > i_size)
@@ -1256,12 +1250,7 @@ int yfs_fs_store_data(struct afs_fs_cursor *fc, struct address_space *mapping,
return -ENOMEM;

call->key = fc->key;
- call->mapping = mapping;
- call->first = first;
- call->last = last;
- call->first_offset = offset;
- call->last_to = to;
- call->send_pages = true;
+ call->write_iter = iter;
call->out_scb = scb;

/* marshall the parameters */
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index c612cabbc378..f663cd482abb 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -762,65 +762,52 @@ TRACE_EVENT(afs_call_done,
__entry->rx_call)
);

-TRACE_EVENT(afs_send_pages,
- TP_PROTO(struct afs_call *call, struct msghdr *msg,
- pgoff_t first, pgoff_t last, unsigned int offset),
+TRACE_EVENT(afs_send_data,
+ TP_PROTO(struct afs_call *call, struct msghdr *msg),

- TP_ARGS(call, msg, first, last, offset),
+ TP_ARGS(call, msg),

TP_STRUCT__entry(
__field(unsigned int, call )
- __field(pgoff_t, first )
- __field(pgoff_t, last )
- __field(unsigned int, nr )
- __field(unsigned int, bytes )
- __field(unsigned int, offset )
__field(unsigned int, flags )
+ __field(loff_t, offset )
+ __field(loff_t, count )
),

TP_fast_assign(
__entry->call = call->debug_id;
- __entry->first = first;
- __entry->last = last;
- __entry->nr = msg->msg_iter.nr_segs;
- __entry->bytes = msg->msg_iter.count;
- __entry->offset = offset;
__entry->flags = msg->msg_flags;
+ __entry->offset = msg->msg_iter.iov_offset;
+ __entry->count = iov_iter_count(&msg->msg_iter);
),

- TP_printk(" c=%08x %lx-%lx-%lx b=%x o=%x f=%x",
- __entry->call,
- __entry->first, __entry->first + __entry->nr - 1, __entry->last,
- __entry->bytes, __entry->offset,
+ TP_printk(" c=%08x o=%llx c=%llx f=%x",
+ __entry->call, __entry->offset, __entry->count,
__entry->flags)
);

-TRACE_EVENT(afs_sent_pages,
- TP_PROTO(struct afs_call *call, pgoff_t first, pgoff_t last,
- pgoff_t cursor, int ret),
+TRACE_EVENT(afs_sent_data,
+ TP_PROTO(struct afs_call *call, struct msghdr *msg, int ret),

- TP_ARGS(call, first, last, cursor, ret),
+ TP_ARGS(call, msg, ret),

TP_STRUCT__entry(
__field(unsigned int, call )
- __field(pgoff_t, first )
- __field(pgoff_t, last )
- __field(pgoff_t, cursor )
__field(int, ret )
+ __field(loff_t, offset )
+ __field(loff_t, count )
),

TP_fast_assign(
__entry->call = call->debug_id;
- __entry->first = first;
- __entry->last = last;
- __entry->cursor = cursor;
__entry->ret = ret;
+ __entry->offset = msg->msg_iter.iov_offset;
+ __entry->count = iov_iter_count(&msg->msg_iter);
),

- TP_printk(" c=%08x %lx-%lx c=%lx r=%d",
- __entry->call,
- __entry->first, __entry->last,
- __entry->cursor, __entry->ret)
+ TP_printk(" c=%08x o=%llx c=%llx r=%d",
+ __entry->call, __entry->offset, __entry->count,
+ __entry->ret)
);

TRACE_EVENT(afs_dir_check_failed,