[PATCH 2/2] md/raid5: exclusive wait_for_stripe

From: Yuanhan Liu
Date: Fri Apr 24 2015 - 09:37:47 EST


I noticed heavy spin lock contention at get_active_stripe() with fsmark
multi-threaded write workloads.

Here is how the contention arises. We have a limited number of stripes, and
with a multi-threaded write workload they are taken quickly, which puts later
processes to sleep waiting for free stripes. When enough stripes (more than
1/4 of the total) are released, all of those processes are woken up and try
to grab the lock. But for each hash lock only one of them can actually take
it, leaving the other processes spinning while they try to acquire it.

Thus it is pointless to wake up all the processes and let them fight for a
lock that only one of them can hold at a time. Instead, we can make the
wakeup exclusive: wake up only one process. That naturally avoids the heavy
spin lock contention.
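
As an illustration (not part of the patch), here is roughly what the
exclusive wait/wake pattern on a Linux wait queue looks like: a waiter added
with prepare_to_wait_exclusive() is queued with WQ_FLAG_EXCLUSIVE, so a plain
wake_up() wakes at most one such waiter instead of the whole queue. The
names wait_exclusive()/release_one() below are made up for the example.

#include <linux/sched.h>
#include <linux/wait.h>

/* Waiter side: sleep exclusively until cond() becomes true. */
static void wait_exclusive(wait_queue_head_t *wq, bool (*cond)(void))
{
        DEFINE_WAIT(wait);      /* uses autoremove_wake_function */

        for (;;) {
                prepare_to_wait_exclusive(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (cond())
                        break;
                schedule();
        }
        finish_wait(wq, &wait);
}

/* Releaser side: wakes at most one exclusive waiter. */
static void release_one(wait_queue_head_t *wq)
{
        wake_up(wq);
}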

Here are some test results I got with this patch applied (each test was run
3 times):

`fsmark.files_per_sec'
=====================

     next-20150317               this patch
--------------------------  --------------------------
 metric_value    ±stddev     metric_value    ±stddev      change   testbox/benchmark/testcase-params
--------------------------  --------------------------  --------   ---------------------------------
       25.600    ±0.0              92.700    ±2.5         262.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
       25.600    ±0.0              77.800    ±0.6         203.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
       32.000    ±0.0              93.800    ±1.7         193.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
       32.000    ±0.0              81.233    ±1.7         153.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
       48.800    ±14.5             99.667    ±2.0         104.2%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
        6.400    ±0.0              12.800    ±0.0         100.0%   ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
       63.133    ±8.2              82.800    ±0.7          31.2%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
      245.067    ±0.7             306.567    ±7.9          25.1%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose
       17.533    ±0.3              21.000    ±0.8          19.8%   ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose
      188.167    ±1.9             215.033    ±3.1          14.3%   ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync
      254.500    ±1.8             290.733    ±2.4          14.2%   ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync

`time.system_time'
=====================

     next-20150317               this patch
--------------------------  --------------------------
 metric_value    ±stddev     metric_value    ±stddev      change   testbox/benchmark/testcase-params
--------------------------  --------------------------  --------   ---------------------------------
     7235.603    ±1.2             185.163    ±1.9         -97.4%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
     7666.883    ±2.9             202.750    ±1.0         -97.4%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
    14567.893    ±0.7             421.230    ±0.4         -97.1%   ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
     3697.667    ±14.0            148.190    ±1.7         -96.0%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
     5572.867    ±3.8             310.717    ±1.4         -94.4%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
     5565.050    ±0.5             313.277    ±1.5         -94.4%   ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
     2420.707    ±17.1            171.043    ±2.7         -92.9%   ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
     3743.300    ±4.6             379.827    ±3.5         -89.9%   ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose
     3308.687    ±6.3             363.050    ±2.0         -89.0%   ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose

Where:

1x: 'x' is the number of iterations (loops), corresponding to the '-L' option of fsmark

1t, 64t: 't' is the number of threads

4M: the size of a single file, corresponding to the '-s' option of fsmark
40G, 30G, 120G: the total test size

4BRD_12G: BRD is the ram-backed block device (brd); '4' means 4 ramdisks and
          '12G' is the size of each ramdisk, 48G in total. The RAID5 array is
          built on those ramdisks.

As you can see, although there is not much throughput gain for the hard disk
workloads, the system time drops heavily, by up to 97%. And, as expected,
throughput improves a lot for fast devices (ramdisk), by up to 260%.
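
For reference, here is roughly how the new helper introduced by this patch is
used (an illustrative sketch distilled from the diff below, not additional
functionality). The helper passes exclusive=1 to ___wait_event(), so the
waiter is queued exclusively and a single wake_up() releases a single waiter;
cmd1 and cmd2 drop and re-take the per-hash spinlock around the sleep:

        spin_lock_irq(conf->hash_locks + hash);
        raid_wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
                                      !list_empty(conf->inactive_list + hash),
                                      spin_unlock_irq(conf->hash_locks + hash),
                                      spin_lock_irq(conf->hash_locks + hash));
        /* ... take a stripe off conf->inactive_list[hash] ... */
        spin_unlock_irq(conf->hash_locks + hash);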

Signed-off-by: Yuanhan Liu <yuanhan.liu@xxxxxxxxxxxxxxx>
---
drivers/md/raid5.c | 46 +++++++++++++++++++++++++++++++++++-----------
drivers/md/raid5.h | 2 +-
2 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b7e385f..2d8fcc1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,9 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 					 int hash)
 {
 	int size;
-	bool do_wakeup = false;
+	bool do_wakeup[NR_STRIPE_HASH_LOCKS] = { false, };
+	bool waked_thread = false;
+	int i = 0;
 	unsigned long flags;
 
 	if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,17 +367,21 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 			if (list_empty(conf->inactive_list + hash) &&
 			    !list_empty(list))
 				atomic_dec(&conf->empty_inactive_list_nr);
 			list_splice_tail_init(list, conf->inactive_list + hash);
-			do_wakeup = true;
+			do_wakeup[hash] = true;
 			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
 		}
 		size--;
 		hash--;
 	}
 
-	if (do_wakeup) {
-		wake_up(&conf->wait_for_stripe);
-		if (conf->retry_read_aligned)
-			md_wakeup_thread(conf->mddev->thread);
+	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+		if (do_wakeup[i]) {
+			wake_up(&conf->wait_for_stripe[i]);
+			if (!waked_thread) {
+				waked_thread = true;
+				md_wakeup_thread(conf->mddev->thread);
+			}
+		}
 	}
 }

@@ -655,6 +661,18 @@ static int has_failed(struct r5conf *conf)
 	return 0;
 }
 
+/* XXX: might be worth moving to linux/wait.h as a public API? */
+#define raid_wait_event_exclusive_cmd(wq, condition, cmd1, cmd2)	\
+do {									\
+	if (condition)							\
+		break;							\
+	(void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 1, 0,	\
+			    cmd1;					\
+			    schedule();					\
+			    cmd2);					\
+} while (0)
+
+
 static struct stripe_head *
 get_active_stripe(struct r5conf *conf, sector_t sector,
 		  int previous, int noblock, int noquiesce)
@@ -684,14 +702,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 		if (!sh) {
 			set_bit(R5_INACTIVE_BLOCKED,
 				&conf->cache_state);
-			wait_event_lock_irq(
-				conf->wait_for_stripe,
+			raid_wait_event_exclusive_cmd(
+				conf->wait_for_stripe[hash],
 				!list_empty(conf->inactive_list + hash) &&
 				(atomic_read(&conf->active_stripes)
 				 < (conf->max_nr_stripes * 3 / 4)
 				 || !test_bit(R5_INACTIVE_BLOCKED,
 					      &conf->cache_state)),
-				*(conf->hash_locks + hash));
+				spin_unlock_irq(conf->hash_locks + hash),
+				spin_lock_irq(conf->hash_locks + hash));
 			clear_bit(R5_INACTIVE_BLOCKED,
 				  &conf->cache_state);
 		} else {
@@ -716,6 +735,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 		}
 	} while (sh == NULL);
 
+	if (!list_empty(conf->inactive_list + hash))
+		wake_up(&conf->wait_for_stripe[hash]);
+
 	spin_unlock_irq(conf->hash_locks + hash);
 	return sh;
 }
@@ -2136,7 +2158,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	cnt = 0;
 	list_for_each_entry(nsh, &newstripes, lru) {
 		lock_device_hash_lock(conf, hash);
-		wait_event_cmd(conf->wait_for_stripe,
+		raid_wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
 				    !list_empty(conf->inactive_list + hash),
 				    unlock_device_hash_lock(conf, hash),
 				    lock_device_hash_lock(conf, hash));
@@ -6391,7 +6413,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	spin_lock_init(&conf->device_lock);
 	seqcount_init(&conf->gen_lock);
 	init_waitqueue_head(&conf->wait_for_quiesce);
-	init_waitqueue_head(&conf->wait_for_stripe);
+	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+		init_waitqueue_head(&conf->wait_for_stripe[i]);
+	}
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index fab53a3..cdad2d2 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -509,7 +509,7 @@ struct r5conf {
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_quiesce;
-	wait_queue_head_t	wait_for_stripe;
+	wait_queue_head_t	wait_for_stripe[NR_STRIPE_HASH_LOCKS];
 	wait_queue_head_t	wait_for_overlap;
 	unsigned long		cache_state;
 #define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
--
1.9.0
