Re: [RFC] Bypass filesystems for reading cached pages
From: Chaitanya Kulkarni
Date: Fri Jun 19 2020 - 15:06:27 EST
On 6/19/20 8:50 AM, Matthew Wilcox wrote:
> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS.
> The advantage of this patch is that we can avoid taking any filesystem
> lock, as long as the pages being accessed are in the cache (and we don't
> need to readahead any pages into the cache). We also avoid an indirect
> function call in these cases.
>
> I'm sure reusing the name call_read_iter() is the wrong way to go about
> this, but renaming all the callers would make this a larger patch.
> I'm happy to do it if something like this stands a chance of being
> accepted.
>
> Compared to Andreas' patch, I removed the -ECANCELED return value.
> We can happily return 0 from generic_file_buffered_read() and it's less
> code to handle that. I bypass the attempt to read from the page cache
> for O_DIRECT reads, and for inodes which have no cached pages. Hopefully
> this will avoid calling generic_file_buffered_read() for drivers which
> implement read_iter() (although I haven't audited them all to check that
> this will work).
>
> This could go horribly wrong if filesystems rely on doing work in their
> ->read_iter implementation (eg checking i_size after acquiring their
> lock) instead of keeping the page cache uptodate. On the other hand,
> the ->map_pages() method is already called without locks, so filesystems
> should already be prepared for this.
>
> Arguably we could do something similar for writes. I'm a little more
> scared of that patch since filesystems are more likely to want to do
> things to keep their files in sync for writes.
I did testing with the NVMeOF target file backend with buffered I/O
enabled, applying your patch and setting IOCB_CACHED OR-ed ('|') with
IOCB_NOWAIT for each I/O, calling call_read_iter_cached() [1].
(The name was changed from call_read_iter() -> call_read_iter_cached() [2].)
For the file system I've used XFS, and the device was null_blk with
memory backing, so the entire file was cached in DRAM.
Following are the performance numbers :-
IOPS/Bandwidth :-
default-page-cache: read: IOPS=1389k, BW=5424MiB/s (5688MB/s)
default-page-cache: read: IOPS=1381k, BW=5395MiB/s (5657MB/s)
default-page-cache: read: IOPS=1391k, BW=5432MiB/s (5696MB/s)
iocb-cached-page-cache: read: IOPS=1403k, BW=5481MiB/s (5747MB/s)
iocb-cached-page-cache: read: IOPS=1393k, BW=5439MiB/s (5704MB/s)
iocb-cached-page-cache: read: IOPS=1399k, BW=5465MiB/s (5731MB/s)
Submission lat :-
default-page-cache: slat (usec): min=2, max=1076, avg= 3.71,
default-page-cache: slat (usec): min=2, max=489, avg= 3.72,
default-page-cache: slat (usec): min=2, max=1078, avg= 3.70,
iocb-cached-page-cache: slat (usec): min=2, max=1731, avg= 3.70,
iocb-cached-page-cache: slat (usec): min=2, max=2115, avg= 3.69,
iocb-cached-page-cache: slat (usec): min=2, max=3055, avg= 3.70,
CPU :-
default-page-cache: cpu : usr=10.36%, sys=49.29%, ctx=5207722,
default-page-cache: cpu : usr=10.49%, sys=49.15%, ctx=5179714,
default-page-cache: cpu : usr=10.56%, sys=49.22%, ctx=5215474,
iocb-cached-page-cache: cpu : usr=10.26%, sys=49.53%, ctx=5262214,
iocb-cached-page-cache: cpu : usr=10.43%, sys=49.35%, ctx=5222433,
iocb-cached-page-cache: cpu : usr=10.47%, sys=49.59%, ctx=5247344,
Regards,
Chaitanya
[1]
diff --git a/drivers/nvme/target/io-cmd-file.c
b/drivers/nvme/target/io-cmd-file.c
index 0abbefd9925e..02fa272399b6 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -120,6 +120,9 @@ static ssize_t nvmet_file_submit_bvec(struct
nvmet_req *req, loff_t pos,
iocb->ki_filp = req->ns->file;
iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
+ if (rw == READ)
+ return call_read_iter_cached(req->ns->file, iocb, &iter);
+
return call_iter(iocb, &iter);
}
@@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
if (req->ns->buffered_io) {
if (likely(!req->f.mpool_alloc) &&
- nvmet_file_execute_io(req, IOCB_NOWAIT))
+ nvmet_file_execute_io(req,
+ IOCB_NOWAIT |IOCB_CACHED))
return;
nvmet_file_submit_buffered_io(req);
[2]
diff --git a/fs/read_write.c b/fs/read_write.c
index bbfa9b12b15e..d4bf2581ff0b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -401,6 +401,41 @@ int rw_verify_area(int read_write, struct file
*file, const loff_t *ppos, size_t
read_write == READ ? MAY_READ : MAY_WRITE);
}
+ssize_t call_read_iter_cached(struct file *file, struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ ssize_t written, ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT)
+ goto uncached;
+ if (!file->f_mapping->nrpages)
+ goto uncached;
+
+ iocb->ki_flags |= IOCB_CACHED;
+ ret = generic_file_buffered_read(iocb, iter, 0);
+ iocb->ki_flags &= ~IOCB_CACHED;
+
+ if (likely(!iov_iter_count(iter)))
+ return ret;
+
+ if (ret == -EAGAIN) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return ret;
+ ret = 0;
+ } else if (ret < 0) {
+ return ret;
+ }
+
+uncached:
+ written = ret;
+
+ ret = file->f_op->read_iter(iocb, iter);
+ if (ret > 0)
+ written += ret;
+
+ return written ? written : ret;
+}
+
static ssize_t new_sync_read(struct file *filp, char __user *buf,
size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6c4ab4dc1cd7..3fc8b00cd140 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ enum rw_hint {
#define IOCB_SYNC (1 << 5)
#define IOCB_WRITE (1 << 6)
#define IOCB_NOWAIT (1 << 7)
+#define IOCB_CACHED (1 << 8)
struct kiocb {
struct file *ki_filp;
@@ -1901,6 +1902,8 @@ static inline ssize_t call_read_iter(struct file
*file, struct kiocb *kio,
return file->f_op->read_iter(kio, iter);
}
+ssize_t call_read_iter_cached(struct file *, struct kiocb *, struct
iov_iter *);
+
static inline ssize_t call_write_iter(struct file *file, struct kiocb
*kio,
struct iov_iter *iter)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..4ee97941a1f2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
page = find_get_page(mapping, index);
if (!page) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED))
goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
@@ -2038,12 +2038,16 @@ ssize_t generic_file_buffered_read(struct kiocb
*iocb,
goto no_cached_page;
}
if (PageReadahead(page)) {
+ if (iocb->ki_flags & IOCB_CACHED) {
+ put_page(page);
+ goto out;
+ }
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
- if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) {
put_page(page);
goto would_block;
}
--
2.26.0