[PATCH 07/12] iomap: allow holding i_rwsem until aio completion

From: Christoph Hellwig
Date: Tue Jan 14 2020 - 11:13:22 EST


The direct I/O code currently uses a hand-crafted i_dio_count that needs
to be incremented under i_rwsem and is then decremented when I/O
completes. That scheme means file system code needs to be very careful
to wait for i_dio_count to reach zero under i_rwsem in various places
that are very cumbersome to get rid of. It also means we can't get the
effect of an exclusive i_rwsem for actually asynchronous I/O, forcing
pointless synchronous execution of sub-block-size writes.

Replace the i_dio_count scheme with holding i_rwsem over the duration
of the whole I/O. While this introduces a non-owner unlock that isn't
nice to RT workloads, the open-coded locking primitive using i_dio_count
isn't any better.

Signed-off-by: Christoph Hellwig <hch@xxxxxx>
---
fs/iomap/direct-io.c | 44 +++++++++++++++++++++++++++++++++++++------
include/linux/iomap.h | 2 ++
2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index e706329d71a0..0113ac33b0a0 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -70,7 +70,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
dio->submit.cookie = submit_bio(bio);
}

-static ssize_t iomap_dio_complete(struct iomap_dio *dio)
+static ssize_t iomap_dio_complete(struct iomap_dio *dio, bool unlock)
{
const struct iomap_dio_ops *dops = dio->dops;
struct kiocb *iocb = dio->iocb;
@@ -112,6 +112,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
dio_warn_stale_pagecache(iocb->ki_filp);
}

+ if (unlock) {
+ if (dio->flags & IOMAP_DIO_RWSEM_EXCL)
+ up_write(&inode->i_rwsem);
+ else if (dio->flags & IOMAP_DIO_RWSEM_SHARED)
+ up_read(&inode->i_rwsem);
+ }
+
/*
* If this is a DSYNC write, make sure we push it to stable storage now
* that we've written data.
@@ -129,8 +136,22 @@ static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
+ struct inode *inode = file_inode(iocb->ki_filp);

- iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
+ /*
+ * XXX: For reads this code is directly called from bio ->end_io, which
+ * often is hard or softirq context. In that case lockdep records the
+ * below as lock acquisitions from irq context and causes warnings.
+ */
+ if (dio->flags & IOMAP_DIO_RWSEM_EXCL) {
+ rwsem_acquire(&inode->i_rwsem.dep_map, 0, 0, _THIS_IP_);
+ if (IS_ENABLED(CONFIG_RWSEM_SPIN_ON_OWNER))
+ atomic_long_set(&inode->i_rwsem.owner, (long)current);
+ } else if (dio->flags & IOMAP_DIO_RWSEM_SHARED) {
+ rwsem_acquire_read(&inode->i_rwsem.dep_map, 0, 0, _THIS_IP_);
+ }
+
+ iocb->ki_complete(iocb, iomap_dio_complete(dio, true), 0);
}

/*
@@ -430,7 +451,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->i_size = i_size_read(inode);
dio->dops = dops;
dio->error = 0;
- dio->flags = 0;
+ dio->flags = dio_flags;

dio->submit.iter = iter;
dio->submit.waiter = current;
@@ -551,8 +572,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->wait_for_completion = wait_for_completion;
if (!atomic_dec_and_test(&dio->ref)) {
if (!wait_for_completion)
- return -EIOCBQUEUED;
-
+ goto async_completion;
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(dio->submit.waiter))
@@ -567,10 +587,22 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
__set_current_state(TASK_RUNNING);
}

- return iomap_dio_complete(dio);
+ return iomap_dio_complete(dio, false);

out_free_dio:
kfree(dio);
return ret;
+
+async_completion:
+ /*
+ * We are returning to userspace now, but i_rwsem is still held until
+ * the I/O completion comes back.
+ */
+ if (dio_flags & (IOMAP_DIO_RWSEM_EXCL | IOMAP_DIO_RWSEM_SHARED))
+ rwsem_release(&inode->i_rwsem.dep_map, _THIS_IP_);
+ if ((dio_flags & IOMAP_DIO_RWSEM_EXCL) &&
+ IS_ENABLED(CONFIG_RWSEM_SPIN_ON_OWNER))
+ atomic_long_set(&inode->i_rwsem.owner, RWSEM_OWNER_UNKNOWN);
+ return -EIOCBQUEUED;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 3faeb8fd0961..f259bb979d7f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -249,6 +249,8 @@ int iomap_writepages(struct address_space *mapping,
#define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */
#define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */
#define IOMAP_DIO_SYNCHRONOUS (1 << 2) /* no async completion */
+#define IOMAP_DIO_RWSEM_EXCL (1 << 3) /* holds exclusive i_rwsem */
+#define IOMAP_DIO_RWSEM_SHARED (1 << 4) /* holds shared i_rwsem */

struct iomap_dio_ops {
int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
--
2.24.1