[PATCH 5.0 034/139] md: batch flush requests.

From: Greg Kroah-Hartman
Date: Thu May 23 2019 - 15:24:59 EST


From: NeilBrown <neilb@xxxxxxxx>

commit 2bc13b83e6298486371761de503faeffd15b7534 upstream.

Currently if many flush requests are submitted to an md device is quick
succession, they are serialized and can take a long to process them all.
We don't really need to call flush all those times - a single flush call
can satisfy all requests submitted before it started.
So keep track of when the current flush started and when it finished,
allow any pending flush that was requested before the flush started
to complete without waiting any more.

Test results from Xiao:

Test is done on a raid10 device which is created by 4 SSDs. The tool is
dbench.

1. The latest linux stable kernel
Operation Count AvgLat MaxLat
--------------------------------------------------
Deltree 768 10.509 78.305
Flush 2078376 0.013 10.094
Close 21787697 0.019 18.821
LockX 96580 0.007 3.184
Mkdir 384 0.008 0.062
Rename 1255883 0.191 23.534
ReadX 46495589 0.020 14.230
WriteX 14790591 7.123 60.706
Unlink 5989118 0.440 54.551
UnlockX 96580 0.005 2.736
FIND_FIRST 10393845 0.042 12.079
SET_FILE_INFORMATION 2415558 0.129 10.088
QUERY_FILE_INFORMATION 4711725 0.005 8.462
QUERY_PATH_INFORMATION 26883327 0.032 21.715
QUERY_FS_INFORMATION 4929409 0.010 8.238
NTCreateX 29660080 0.100 53.268

Throughput 1034.88 MB/sec (sync open) 128 clients 128 procs
max_latency=60.712 ms

2. With patch1 "Revert "MD: fix lock contention for flush bios""
Operation Count AvgLat MaxLat
--------------------------------------------------
Deltree 256 8.326 36.761
Flush 693291 3.974 180.269
Close 7266404 0.009 36.929
LockX 32160 0.006 0.840
Mkdir 128 0.008 0.021
Rename 418755 0.063 29.945
ReadX 15498708 0.007 7.216
WriteX 4932310 22.482 267.928
Unlink 1997557 0.109 47.553
UnlockX 32160 0.004 1.110
FIND_FIRST 3465791 0.036 7.320
SET_FILE_INFORMATION 805825 0.015 1.561
QUERY_FILE_INFORMATION 1570950 0.005 2.403
QUERY_PATH_INFORMATION 8965483 0.013 14.277
QUERY_FS_INFORMATION 1643626 0.009 3.314
NTCreateX 9892174 0.061 41.278

Throughput 345.009 MB/sec (sync open) 128 clients 128 procs
max_latency=267.939 m

3. With patch1 and patch2
Operation Count AvgLat MaxLat
--------------------------------------------------
Deltree 768 9.570 54.588
Flush 2061354 0.666 15.102
Close 21604811 0.012 25.697
LockX 95770 0.007 1.424
Mkdir 384 0.008 0.053
Rename 1245411 0.096 12.263
ReadX 46103198 0.011 12.116
WriteX 14667988 7.375 60.069
Unlink 5938936 0.173 30.905
UnlockX 95770 0.005 4.147
FIND_FIRST 10306407 0.041 11.715
SET_FILE_INFORMATION 2395987 0.048 7.640
QUERY_FILE_INFORMATION 4672371 0.005 9.291
QUERY_PATH_INFORMATION 26656735 0.018 19.719
QUERY_FS_INFORMATION 4887940 0.010 7.654
NTCreateX 29410811 0.059 28.551

Throughput 1026.21 MB/sec (sync open) 128 clients 128 procs
max_latency=60.075 ms

Cc: <stable@xxxxxxxxxxxxxxx> # v4.19+
Tested-by: Xiao Ni <xni@xxxxxxxxxx>
Signed-off-by: NeilBrown <neilb@xxxxxxxx>
Signed-off-by: Song Liu <songliubraving@xxxxxx>
Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>

---
drivers/md/md.c | 27 +++++++++++++++++++++++----
drivers/md/md.h | 3 +++
2 files changed, 26 insertions(+), 4 deletions(-)

--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -427,6 +427,7 @@ static void submit_flushes(struct work_s
struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;

+ mddev->start_flush = ktime_get_boottime();
INIT_WORK(&mddev->flush_work, md_submit_flush_data);
atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
@@ -467,6 +468,7 @@ static void md_submit_flush_data(struct
* could wait for this and below md_handle_request could wait for those
* bios because of suspend check
*/
+ mddev->last_flush = mddev->start_flush;
mddev->flush_bio = NULL;
wake_up(&mddev->sb_wait);

@@ -481,15 +483,32 @@ static void md_submit_flush_data(struct

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
+ ktime_t start = ktime_get_boottime();
spin_lock_irq(&mddev->lock);
wait_event_lock_irq(mddev->sb_wait,
- !mddev->flush_bio,
+ !mddev->flush_bio ||
+ ktime_after(mddev->last_flush, start),
mddev->lock);
- mddev->flush_bio = bio;
+ if (!ktime_after(mddev->last_flush, start)) {
+ WARN_ON(mddev->flush_bio);
+ mddev->flush_bio = bio;
+ bio = NULL;
+ }
spin_unlock_irq(&mddev->lock);

- INIT_WORK(&mddev->flush_work, submit_flushes);
- queue_work(md_wq, &mddev->flush_work);
+ if (!bio) {
+ INIT_WORK(&mddev->flush_work, submit_flushes);
+ queue_work(md_wq, &mddev->flush_work);
+ } else {
+ /* flush was performed for some other bio while we waited. */
+ if (bio->bi_iter.bi_size == 0)
+ /* an empty barrier - all done */
+ bio_endio(bio);
+ else {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+ mddev->pers->make_request(mddev, bio);
+ }
+ }
}
EXPORT_SYMBOL(md_flush_request);

--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -463,6 +463,9 @@ struct mddev {
*/
struct bio *flush_bio;
atomic_t flush_pending;
+ ktime_t start_flush, last_flush; /* last_flush is when the last completed
+ * flush was started.
+ */
struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);