Re: [PATCH 135/241] md/raid10: close race that lose writes lostwhen replacement completes.

From: Ben Hutchings
Date: Sat Dec 15 2012 - 13:40:34 EST


On Thu, 2012-12-13 at 11:58 -0200, Herton Ronaldo Krzesinski wrote:
> 3.5.7.2 -stable review patch. If anyone has any objections, please let me know.
>
> ------------------
>
> From: NeilBrown <neilb@xxxxxxx>
>
> commit e7c0c3fa29280d62aa5e11101a674bb3064bd791 upstream.
>
> When a replacement operation completes there is a small window
> when the original device is marked 'faulty' and the replacement
> still looks like a replacement. The faulty should be removed and
> the replacement moved in place very quickly, bit it isn't instant.
>
> So the code write out to the array must handle the possibility that
> the only working device for some slot in the replacement - but it
> doesn't. If the primary device is faulty it just gives up. This
> can lead to corruption.
>
> So make the code more robust: if either the primary or the
> replacement is present and working, write to them. Only when
> neither are present do we give up.
>
> This bug has been present since replacement was introduced in
> 3.3, so it is suitable for any -stable kernel since then.

This is missing from 3.4, so Greg will presumably want to apply this (if
the backport is correct).

Ben.

> Reported-by: "George Spelvin" <linux@xxxxxxxxxxx>
> Signed-off-by: NeilBrown <neilb@xxxxxxx>
> [ herton: hairy code adjustment on 3rd hunk (conf->copies for loop) ]
> Signed-off-by: Herton Ronaldo Krzesinski <herton.krzesinski@xxxxxxxxxxxxx>
> ---
> drivers/md/raid10.c | 113 +++++++++++++++++++++++++++------------------------
> 1 file changed, 59 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 17fae37..0920adf 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1267,18 +1267,21 @@ retry_write:
> blocked_rdev = rrdev;
> break;
> }
> + if (rdev && (test_bit(Faulty, &rdev->flags)
> + || test_bit(Unmerged, &rdev->flags)))
> + rdev = NULL;
> if (rrdev && (test_bit(Faulty, &rrdev->flags)
> || test_bit(Unmerged, &rrdev->flags)))
> rrdev = NULL;
>
> r10_bio->devs[i].bio = NULL;
> r10_bio->devs[i].repl_bio = NULL;
> - if (!rdev || test_bit(Faulty, &rdev->flags) ||
> - test_bit(Unmerged, &rdev->flags)) {
> +
> + if (!rdev && !rrdev) {
> set_bit(R10BIO_Degraded, &r10_bio->state);
> continue;
> }
> - if (test_bit(WriteErrorSeen, &rdev->flags)) {
> + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
> sector_t first_bad;
> sector_t dev_sector = r10_bio->devs[i].addr;
> int bad_sectors;
> @@ -1320,8 +1323,10 @@ retry_write:
> max_sectors = good_sectors;
> }
> }
> - r10_bio->devs[i].bio = bio;
> - atomic_inc(&rdev->nr_pending);
> + if (rdev) {
> + r10_bio->devs[i].bio = bio;
> + atomic_inc(&rdev->nr_pending);
> + }
> if (rrdev) {
> r10_bio->devs[i].repl_bio = bio;
> atomic_inc(&rrdev->nr_pending);
> @@ -1377,58 +1382,58 @@ retry_write:
> for (i = 0; i < conf->copies; i++) {
> struct bio *mbio;
> int d = r10_bio->devs[i].devnum;
> - if (!r10_bio->devs[i].bio)
> - continue;
> -
> - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> - max_sectors);
> - r10_bio->devs[i].bio = mbio;
> -
> - mbio->bi_sector = (r10_bio->devs[i].addr+
> - choose_data_offset(r10_bio,
> - conf->mirrors[d].rdev));
> - mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
> - mbio->bi_end_io = raid10_end_write_request;
> - mbio->bi_rw = WRITE | do_sync | do_fua;
> - mbio->bi_private = r10_bio;
> -
> - atomic_inc(&r10_bio->remaining);
> - spin_lock_irqsave(&conf->device_lock, flags);
> - bio_list_add(&conf->pending_bio_list, mbio);
> - conf->pending_count++;
> - spin_unlock_irqrestore(&conf->device_lock, flags);
> - if (!mddev_check_plugged(mddev))
> - md_wakeup_thread(mddev->thread);
> -
> - if (!r10_bio->devs[i].repl_bio)
> - continue;
> + if (r10_bio->devs[i].bio) {
> + struct md_rdev *rdev = conf->mirrors[d].rdev;
> + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> + max_sectors);
> + r10_bio->devs[i].bio = mbio;
> +
> + mbio->bi_sector = (r10_bio->devs[i].addr+
> + choose_data_offset(r10_bio,
> + rdev));
> + mbio->bi_bdev = rdev->bdev;
> + mbio->bi_end_io = raid10_end_write_request;
> + mbio->bi_rw = WRITE | do_sync | do_fua;
> + mbio->bi_private = r10_bio;
>
> - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> - max_sectors);
> - r10_bio->devs[i].repl_bio = mbio;
> + atomic_inc(&r10_bio->remaining);
> + spin_lock_irqsave(&conf->device_lock, flags);
> + bio_list_add(&conf->pending_bio_list, mbio);
> + conf->pending_count++;
> + spin_unlock_irqrestore(&conf->device_lock, flags);
> + if (!mddev_check_plugged(mddev))
> + md_wakeup_thread(mddev->thread);
> + }
>
> - /* We are actively writing to the original device
> - * so it cannot disappear, so the replacement cannot
> - * become NULL here
> - */
> - mbio->bi_sector = (r10_bio->devs[i].addr +
> - choose_data_offset(
> - r10_bio,
> - conf->mirrors[d].replacement));
> - mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
> - mbio->bi_end_io = raid10_end_write_request;
> - mbio->bi_rw = WRITE | do_sync | do_fua;
> - mbio->bi_private = r10_bio;
> + if (r10_bio->devs[i].repl_bio) {
> + struct md_rdev *rdev = conf->mirrors[d].replacement;
> + if (rdev == NULL) {
> + /* Replacement just got moved to main 'rdev' */
> + smp_mb();
> + rdev = conf->mirrors[d].rdev;
> + }
> + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> + max_sectors);
> + r10_bio->devs[i].repl_bio = mbio;
> +
> + mbio->bi_sector = (r10_bio->devs[i].addr +
> + choose_data_offset(
> + r10_bio, rdev));
> + mbio->bi_bdev = rdev->bdev;
> + mbio->bi_end_io = raid10_end_write_request;
> + mbio->bi_rw = WRITE | do_sync | do_fua;
> + mbio->bi_private = r10_bio;
>
> - atomic_inc(&r10_bio->remaining);
> - spin_lock_irqsave(&conf->device_lock, flags);
> - bio_list_add(&conf->pending_bio_list, mbio);
> - conf->pending_count++;
> - spin_unlock_irqrestore(&conf->device_lock, flags);
> - if (!mddev_check_plugged(mddev))
> - md_wakeup_thread(mddev->thread);
> + atomic_inc(&r10_bio->remaining);
> + spin_lock_irqsave(&conf->device_lock, flags);
> + bio_list_add(&conf->pending_bio_list, mbio);
> + conf->pending_count++;
> + spin_unlock_irqrestore(&conf->device_lock, flags);
> + if (!mddev_check_plugged(mddev))
> + md_wakeup_thread(mddev->thread);
> + }
> }
>
> /* Don't remove the bias on 'remaining' (one_write_done) until

--
Ben Hutchings
Theory and practice are closer in theory than in practice.
- John Levine, moderator of comp.compilers

Attachment: signature.asc
Description: This is a digitally signed message part