From: Li Nan <linan122@xxxxxxxxxx>
I'll suggest not to fix read error for replacement, it's better to error
In fix_read_error(), the handling of replacement devices is missing. If a
read from the replacement device fails, we will attempt to fix
'mirror->rdev' instead, which is wrong. Get rdev from r10bio to ensure
that the device being fixed is the one on which the read error occurred.
Signed-off-by: Li Nan <linan122@xxxxxxxxxx>
---
drivers/md/raid10.c | 32 +++++++++++++++++---------------
1 file changed, 17 insertions(+), 15 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a36e53fce21f..4a7c8eaf6ea0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2726,15 +2726,10 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
{
int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
- struct md_rdev *rdev;
+ struct md_rdev *rdev = r10_bio->devs[slot].rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[slot].devnum;
- /* still own a reference to this rdev, so it cannot
- * have been cleared recently.
- */
- rdev = conf->mirrors[d].rdev;
-
if (test_bit(Faulty, &rdev->flags))
/* drive has already been failed, just ignore any
more fix_read_error() attempts */
@@ -2763,12 +2758,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
s = PAGE_SIZE >> 9;
rcu_read_lock();
+ rdev = r10_bio->devs[slot].rdev;
do {
sector_t first_bad;
int bad_sectors;
- d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
@@ -2790,6 +2784,8 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
sl++;
if (sl == conf->copies)
sl = 0;
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
} while (sl != slot);
rcu_read_unlock();
@@ -2798,9 +2794,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
* as bad on the first device to discourage future
* reads.
*/
- int dn = r10_bio->devs[slot].devnum;
- rdev = conf->mirrors[dn].rdev;
-
+ rdev = r10_bio->devs[slot].rdev;
if (!rdev_set_badblocks(
rdev,
r10_bio->devs[slot].addr
@@ -2820,8 +2814,12 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
if (sl==0)
sl = conf->copies;
sl--;
- d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (sl == slot) {
+ rdev = r10_bio->devs[slot].rdev;
+ } else {
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ }
if (!rdev ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
@@ -2854,8 +2852,12 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
if (sl==0)
sl = conf->copies;
sl--;
- d = r10_bio->devs[sl].devnum;
- rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (sl == slot) {
+ rdev = r10_bio->devs[slot].rdev;
+ } else {
+ d = r10_bio->devs[sl].devnum;
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
+ }
if (!rdev ||
test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))