Re: [PATCH RFC] md: fix is_mddev_idle()
From: Xiao Ni
Date: Wed Apr 09 2025 - 04:46:57 EST
Hi Kuai
I ran a test with your patch. It also causes a big sync speed decrease
during the test. Here are some test results:
fio --name=read --filename=/dev/md0 --ioengine=libaio --rw=read
--bs=4k --direct=1 --numjobs=1 --runtime=120 --group_reporting
original version:
READ: bw=1845KiB/s (1889kB/s), 1845KiB/s-1845KiB/s
(1889kB/s-1889kB/s), io=216MiB (227MB), run=120053-120053msec
sync speed: ~190MB/s
with my patch:
READ: bw=19.1MiB/s (20.0MB/s), 19.1MiB/s-19.1MiB/s
(20.0MB/s-20.0MB/s), io=2286MiB (2397MB), run=120013-120013msec
sync speed: 80~100MB/s
with this patch:
READ: bw=20.3MiB/s (21.2MB/s), 20.3MiB/s-20.3MiB/s
(21.2MB/s-21.2MB/s), io=2431MiB (2549MB), run=120001-120001msec
sync speed: ~40MB/s
fio --name=read --filename=/dev/md0 --ioengine=libaio --rw=read
--bs=4k --direct=1 --numjobs=1 --iodepth=32 --runtime=120
--group_reporting
original version:
READ: bw=9.78MiB/s (10.3MB/s), 9.78MiB/s-9.78MiB/s
(10.3MB/s-10.3MB/s), io=1174MiB (1231MB), run=120001-120001msec
sync speed: ~170MB/s
with my patch:
READ: bw=68.3MiB/s (71.6MB/s), 68.3MiB/s-68.3MiB/s
(71.6MB/s-71.6MB/s), io=8193MiB (8591MB), run=120014-120014msec
sync speed: ~100MB/s
with this patch:
READ: bw=110MiB/s (115MB/s), 110MiB/s-110MiB/s (115MB/s-115MB/s),
io=12.8GiB (13.8GB), run=120003-120003msec
sync speed: ~25MB/s
fio --name=write --filename=/dev/md0 --ioengine=libaio --rw=write
--bs=4k --direct=1 --numjobs=1 --iodepth=32 --runtime=120
--group_reporting
original version:
WRITE: bw=1203KiB/s (1232kB/s), 1203KiB/s-1203KiB/s
(1232kB/s-1232kB/s), io=142MiB (149MB), run=120936-120936msec
sync speed: ~170MB/s
with my patch:
WRITE: bw=4994KiB/s (5114kB/s), 4994KiB/s-4994KiB/s
(5114kB/s-5114kB/s), io=590MiB (619MB), run=121076-121076msec
sync speed: 100~110MB/s
with this patch:
WRITE: bw=10.5MiB/s (11.0MB/s), 10.5MiB/s-10.5MiB/s
(11.0MB/s-11.0MB/s), io=1261MiB (1323MB), run=120002-120002msec
sync speed: ~13MB/s
fio --name=randread --filename=/dev/md0 --ioengine=libaio
--rw=randread --random_generator=tausworthe64 --bs=4k --direct=1
--numjobs=1 --runtime=120 --group_reporting
original version:
READ: bw=17.5KiB/s (18.0kB/s), 17.5KiB/s-17.5KiB/s
(18.0kB/s-18.0kB/s), io=2104KiB (2154kB), run=120008-120008msec
sync speed: ~180MB/s
with my patch:
READ: bw=63.5KiB/s (65.0kB/s), 63.5KiB/s-63.5KiB/s
(65.0kB/s-65.0kB/s), io=7628KiB (7811kB), run=120201-120201msec
sync speed: 150~160MB/s
with this patch:
READ: bw=266KiB/s (273kB/s), 266KiB/s-266KiB/s (273kB/s-273kB/s),
io=31.2MiB (32.7MB), run=120001-120001msec
sync speed: ~15MB/s
The sync speed decreases too much with this patch. As we discussed, I'm
fine with it as a new project: we can give upper-layer IO a higher
priority. But md has behaved this way for almost 10 years, since commit
ac8fa4196d20 ("md: allow resync to go faster when there is competing
IO."). It's not good to change this now (just my opinion). And I don't
think it's bad for raid5 to tell md about the IO situation (see my RFC:
https://www.spinics.net/lists/raid/msg79342.html).
Best Regards
Xiao
On Tue, Apr 8, 2025 at 8:50 PM Yu Kuai <yukuai1@xxxxxxxxxxxxxxx> wrote:
>
> From: Yu Kuai <yukuai3@xxxxxxxxxx>
>
> If sync_speed is above speed_min, then is_mddev_idle() will be called
> for each sync IO to check if the array is idle, and inflight sync IO
> will be limited to one if the array is not idle.
>
> However, when running mkfs.ext4 on a large raid5 array while recovery
> is in progress, it turns out that sync_speed stays above speed_min even
> while lots of stripes are used for sync IO, causing a long delay for
> mkfs.ext4.
>
> The root cause is the following check in is_mddev_idle():
>
> t1: submit sync IO: events1 = completed IO - issued sync IO
> t2: submit next sync IO: events2 = completed IO - issued sync IO
> if (events2 - events1 > 64) -> the array is considered not idle
>
> As a consequence, the more sync IO is issued, the less likely the
> check will pass. And once the completed normal IO outpaces the issued
> sync IO, the condition will finally pass and is_mddev_idle() will
> return false; however, last_events is updated at the same time, hence
> is_mddev_idle() can only return false once in a while.
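>
> To make the failure mode concrete, here is a minimal userspace model of
> the old heuristic (a sketch with illustrative numbers; the counters are
> hypothetical stand-ins for the real disk_stats/sync_io accounting):
>
> #include <stdio.h>
>
> static int last_events;
>
> /*
>  * completed: total sectors completed (normal + sync IO, counted at
>  * completion); issued: total sync sectors, counted at submission.
>  */
> static int old_is_idle(int completed, int issued)
> {
> 	int curr_events = completed - issued;
>
> 	if (curr_events - last_events > 64) {
> 		last_events = curr_events;
> 		return 0;	/* not idle: throttle resync */
> 	}
> 	return 1;	/* idle: resync runs at full speed */
> }
>
> int main(void)
> {
> 	/* 100 sectors of normal IO completed, but 200 sync sectors issued */
> 	printf("%d\n", old_is_idle(100, 200));	/* 1: wrongly reported idle */
> 	/* normal IO keeps completing until it outpaces the sync IO */
> 	printf("%d\n", old_is_idle(400, 210));	/* 0: finally not idle... */
> 	/* ...but last_events was just updated, so "idle" again right away */
> 	printf("%d\n", old_is_idle(410, 215));	/* 1 */
> 	return 0;
> }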
>
> Fix this problem by changing the check as follows; the array is
> considered idle only if, since the last check:
>
> 1) mddev doesn't have any normal IO completed;
> 2) mddev doesn't have any normal IO inflight;
> 3) if any member disk is a partition, none of the other partitions
> have IO completed.
>
> Note that in order to prevent the sync speed from dropping
> conspicuously, the allowed inflight sync IO above speed_min is also
> increased from 1 to 8.
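>
> In sectors (assuming recovery_active is counted in sectors, which the
> thresholds below suggest), that limit works out to 8 * 128 = 1024
> sectors for 64k RESYNC_PAGES IO, and 8 * 8 = 64 sectors for 4k raid456
> stripe IO, matching get_active_threshold().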
>
> Signed-off-by: Yu Kuai <yukuai3@xxxxxxxxxx>
> ---
> block/blk.h | 1 -
> block/genhd.c | 1 +
> drivers/md/md.c | 97 +++++++++++++++++++++++++-----------------
> drivers/md/md.h | 12 +-----
> drivers/md/raid1.c | 3 --
> drivers/md/raid10.c | 9 ----
> drivers/md/raid5.c | 8 ----
> include/linux/blkdev.h | 2 +-
> 8 files changed, 60 insertions(+), 73 deletions(-)
>
> diff --git a/block/blk.h b/block/blk.h
> index 90fa5f28ccab..a78f9df72a83 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -413,7 +413,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
> int blk_dev_init(void);
>
> void update_io_ticks(struct block_device *part, unsigned long now, bool end);
> -unsigned int part_in_flight(struct block_device *part);
>
> static inline void req_set_nomerge(struct request_queue *q, struct request *req)
> {
> diff --git a/block/genhd.c b/block/genhd.c
> index e9375e20d866..0ce35bc88196 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -139,6 +139,7 @@ unsigned int part_in_flight(struct block_device *part)
>
> return inflight;
> }
> +EXPORT_SYMBOL_GPL(part_in_flight);
>
> static void part_in_flight_rw(struct block_device *part,
> unsigned int inflight[2])
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index cefa9cba711b..c65483a33d7a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -8585,50 +8585,51 @@ void md_cluster_stop(struct mddev *mddev)
> put_cluster_ops(mddev);
> }
>
> -static int is_mddev_idle(struct mddev *mddev, int init)
> +static bool is_rdev_idle(struct md_rdev *rdev, bool init)
> {
> - struct md_rdev *rdev;
> - int idle;
> - int curr_events;
> + int last_events = rdev->last_events;
>
> - idle = 1;
> - rcu_read_lock();
> - rdev_for_each_rcu(rdev, mddev) {
> - struct gendisk *disk = rdev->bdev->bd_disk;
> + if (!bdev_is_partition(rdev->bdev))
> + return true;
>
> - if (!init && !blk_queue_io_stat(disk->queue))
> - continue;
> + rdev->last_events = (int)part_stat_read_accum(rdev->bdev->bd_disk->part0, sectors) -
> + (int)part_stat_read_accum(rdev->bdev, sectors);
>
> - curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
> - atomic_read(&disk->sync_io);
> - /* sync IO will cause sync_io to increase before the disk_stats
> - * as sync_io is counted when a request starts, and
> - * disk_stats is counted when it completes.
> - * So resync activity will cause curr_events to be smaller than
> - * when there was no such activity.
> - * non-sync IO will cause disk_stat to increase without
> - * increasing sync_io so curr_events will (eventually)
> - * be larger than it was before. Once it becomes
> - * substantially larger, the test below will cause
> - * the array to appear non-idle, and resync will slow
> - * down.
> - * If there is a lot of outstanding resync activity when
> - * we set last_event to curr_events, then all that activity
> - * completing might cause the array to appear non-idle
> - * and resync will be slowed down even though there might
> - * not have been non-resync activity. This will only
> - * happen once though. 'last_events' will soon reflect
> - * the state where there is little or no outstanding
> - * resync requests, and further resync activity will
> - * always make curr_events less than last_events.
> - *
> - */
> - if (init || curr_events - rdev->last_events > 64) {
> - rdev->last_events = curr_events;
> - idle = 0;
> - }
> + if (!init && rdev->last_events > last_events)
> + return false;
> +
> + return true;
> +}
> +
> +/*
> + * mddev is idle if the following conditions are met since the last check:
> + * 1) mddev doesn't have normal IO completed;
> + * 2) mddev doesn't have inflight normal IO;
> + * 3) if any member disk is a partition, none of the other partitions
> + *    have IO completed;
> + *
> + * Note that this check relies on IO accounting being enabled.
> + */
> +static bool is_mddev_idle(struct mddev *mddev, bool init)
> +{
> + struct md_rdev *rdev;
> + bool idle = true;
> +
> + if (!mddev_is_dm(mddev)) {
> + int last_events = mddev->last_events;
> +
> + mddev->last_events = (int)part_stat_read_accum(mddev->gendisk->part0, sectors);
> + if (!init && (mddev->last_events > last_events ||
> + part_in_flight(mddev->gendisk->part0)))
> + idle = false;
> }
> +
> + rcu_read_lock();
> + rdev_for_each_rcu(rdev, mddev)
> + if (!is_rdev_idle(rdev, init))
> + idle = false;
> rcu_read_unlock();
> +
> return idle;
> }
>
> @@ -8940,6 +8941,21 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
> }
> }
>
> +/*
> + * For raid456, sync IO is one stripe (4k) per IO; for other levels,
> + * it's RESYNC_PAGES (64k) per IO. Limit inflight sync IO to no more
> + * than 8 if sync_speed is above speed_min.
> + */
> +static int get_active_threshold(struct mddev *mddev)
> +{
> + int max_active = 128 * 8;
> +
> + if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
> + max_active = 8 * 8;
> +
> + return max_active;
> +}
> +
> #define SYNC_MARKS 10
> #define SYNC_MARK_STEP (3*HZ)
> #define UPDATE_FREQUENCY (5*60*HZ)
> @@ -8953,6 +8969,7 @@ void md_do_sync(struct md_thread *thread)
> unsigned long update_time;
> sector_t mark_cnt[SYNC_MARKS];
> int last_mark,m;
> + int active_threshold = get_active_threshold(mddev);
> sector_t last_check;
> int skipped = 0;
> struct md_rdev *rdev;
> @@ -9208,14 +9225,14 @@ void md_do_sync(struct md_thread *thread)
> msleep(500);
> goto repeat;
> }
> - if (!is_mddev_idle(mddev, 0)) {
> + if (atomic_read(&mddev->recovery_active) >= active_threshold &&
> + !is_mddev_idle(mddev, 0))
> /*
> * Give other IO more of a chance.
> * The faster the devices, the less we wait.
> */
> wait_event(mddev->recovery_wait,
> !atomic_read(&mddev->recovery_active));
> - }
> }
> }
> pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index dd6a28f5d8e6..6890aa4ac8b4 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -518,6 +518,7 @@ struct mddev {
> * adding a spare
> */
>
> + int last_events; /* completed IO sectors, for idle detection */
> atomic_t recovery_active; /* blocks scheduled, but not written */
> wait_queue_head_t recovery_wait;
> sector_t recovery_cp;
> @@ -714,17 +715,6 @@ static inline int mddev_trylock(struct mddev *mddev)
> }
> extern void mddev_unlock(struct mddev *mddev);
>
> -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
> -{
> - if (blk_queue_io_stat(bdev->bd_disk->queue))
> - atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
> -}
> -
> -static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
> -{
> - md_sync_acct(bio->bi_bdev, nr_sectors);
> -}
> -
> struct md_personality
> {
> struct md_submodule_head head;
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index e366d0bba792..d422bab77580 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -2376,7 +2376,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
>
> wbio->bi_end_io = end_sync_write;
> atomic_inc(&r1_bio->remaining);
> - md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
>
> submit_bio_noacct(wbio);
> }
> @@ -3049,7 +3048,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
> bio = r1_bio->bios[i];
> if (bio->bi_end_io == end_sync_read) {
> read_targets--;
> - md_sync_acct_bio(bio, nr_sectors);
> if (read_targets == 1)
> bio->bi_opf &= ~MD_FAILFAST;
> submit_bio_noacct(bio);
> @@ -3058,7 +3056,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
> } else {
> atomic_set(&r1_bio->remaining, 1);
> bio = r1_bio->bios[r1_bio->read_disk];
> - md_sync_acct_bio(bio, nr_sectors);
> if (read_targets == 1)
> bio->bi_opf &= ~MD_FAILFAST;
> submit_bio_noacct(bio);
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 6ef65b4d1093..12fb01987ff3 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
>
> atomic_inc(&conf->mirrors[d].rdev->nr_pending);
> atomic_inc(&r10_bio->remaining);
> - md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
>
> if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
> tbio->bi_opf |= MD_FAILFAST;
> @@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
> bio_copy_data(tbio, fbio);
> d = r10_bio->devs[i].devnum;
> atomic_inc(&r10_bio->remaining);
> - md_sync_acct(conf->mirrors[d].replacement->bdev,
> - bio_sectors(tbio));
> submit_bio_noacct(tbio);
> }
>
> @@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
> d = r10_bio->devs[1].devnum;
> if (wbio->bi_end_io) {
> atomic_inc(&conf->mirrors[d].rdev->nr_pending);
> - md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
> submit_bio_noacct(wbio);
> }
> if (wbio2) {
> atomic_inc(&conf->mirrors[d].replacement->nr_pending);
> - md_sync_acct(conf->mirrors[d].replacement->bdev,
> - bio_sectors(wbio2));
> submit_bio_noacct(wbio2);
> }
> }
> @@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
> r10_bio->sectors = nr_sectors;
>
> if (bio->bi_end_io == end_sync_read) {
> - md_sync_acct_bio(bio, nr_sectors);
> bio->bi_status = 0;
> submit_bio_noacct(bio);
> }
> @@ -4882,7 +4875,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
> r10_bio->sectors = nr_sectors;
>
> /* Now submit the read */
> - md_sync_acct_bio(read_bio, r10_bio->sectors);
> atomic_inc(&r10_bio->remaining);
> read_bio->bi_next = NULL;
> submit_bio_noacct(read_bio);
> @@ -4942,7 +4934,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
> continue;
>
> atomic_inc(&rdev->nr_pending);
> - md_sync_acct_bio(b, r10_bio->sectors);
> atomic_inc(&r10_bio->remaining);
> b->bi_next = NULL;
> submit_bio_noacct(b);
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 6389383166c0..ca5b0e8ba707 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -1240,10 +1240,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
> }
>
> if (rdev) {
> - if (s->syncing || s->expanding || s->expanded
> - || s->replacing)
> - md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
> -
> set_bit(STRIPE_IO_STARTED, &sh->state);
>
> bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
> @@ -1300,10 +1296,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
> submit_bio_noacct(bi);
> }
> if (rrdev) {
> - if (s->syncing || s->expanding || s->expanded
> - || s->replacing)
> - md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
> -
> set_bit(STRIPE_IO_STARTED, &sh->state);
>
> bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 248416ecd01c..da1a161627ba 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -182,7 +182,6 @@ struct gendisk {
> struct list_head slave_bdevs;
> #endif
> struct timer_rand_state *random;
> - atomic_t sync_io; /* RAID */
> struct disk_events *ev;
>
> #ifdef CONFIG_BLK_DEV_ZONED
> @@ -1117,6 +1116,7 @@ static inline long nr_blockdev_pages(void)
>
> extern void blk_io_schedule(void);
>
> +unsigned int part_in_flight(struct block_device *part);
> int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> sector_t nr_sects, gfp_t gfp_mask);
> int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> --
> 2.39.2
>