Re: Raid Array with 3.5Tb

From: Evan Felix
Date: Wed Mar 24 2004 - 16:18:03 EST


Remember the 3.5TB array I've been trying to build? I finally got around
to making some source code changes that seem to work much better.
Attached you will find a patch that fixes the raid5 code to the point
where it seems to re-sync and recover without complaining about maps not
being correct. The patch below is built against 2.6.3, but will apply to
2.6.4 sources as well. At this point I'd like to hear comments and
thoughts on the changes I've made. Some notes:

1. raid0 seems to work fine at 3.5TB.
2. I have not looked at the raid6 code, but it does not work at 3.5TB.
3. Formatting the array with ext3 takes a very long time; I'm not sure
why yet.

Here is the patch:
diff -urN -X /home/efelix/.cvsignore kernel-source-2.6.3/drivers/md/md.c
kernel-source-2.6.3evan1/drivers/md/md.c
--- kernel-source-2.6.3/drivers/md/md.c 2004-02-19 08:54:51.000000000
+0000
+++ kernel-source-2.6.3evan1/drivers/md/md.c 2004-03-17
21:52:25.000000000 +0000
@@ -3138,13 +3138,14 @@
static void md_do_sync(mddev_t *mddev)
{
mddev_t *mddev2;
- unsigned int max_sectors, currspeed = 0,
- j, window;
+ unsigned int currspeed = 0,
+ window;
+ sector_t max_sectors,j;
unsigned long mark[SYNC_MARKS];
- unsigned long mark_cnt[SYNC_MARKS];
+ sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
struct list_head *tmp;
- unsigned long last_check;
+ sector_t last_check;

/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -3282,7 +3283,7 @@
*/
cond_resched();

- currspeed =
(j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+ currspeed = ((unsigned
long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1)
+1;

if (currspeed > sysctl_speed_limit_min) {
if ((currspeed > sysctl_speed_limit_max) ||
diff -urN -X /home/efelix/.cvsignore
kernel-source-2.6.3/drivers/md/raid5.c
kernel-source-2.6.3evan1/drivers/md/raid5.c
--- kernel-source-2.6.3/drivers/md/raid5.c 2004-02-19 08:54:52.000000000
+0000
+++ kernel-source-2.6.3evan1/drivers/md/raid5.c 2004-03-17
20:46:52.000000000 +0000
@@ -181,7 +181,7 @@

static void raid5_build_block (struct stripe_head *sh, int i);

-static inline void init_stripe(struct stripe_head *sh, unsigned long
sector, int pd_idx)
+static inline void init_stripe(struct stripe_head *sh, sector_t sector,
int pd_idx)
{
raid5_conf_t *conf = sh->raid_conf;
int disks = conf->raid_disks, i;
@@ -218,7 +218,7 @@
insert_hash(conf, sh);
}

-static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned
long sector)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t
sector)
{
struct stripe_head *sh;

@@ -231,7 +231,7 @@
return NULL;
}

-static struct stripe_head *get_active_stripe(raid5_conf_t *conf,
unsigned long sector,
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf,
sector_t sector,
int pd_idx, int noblock)
{
struct stripe_head *sh;
@@ -495,7 +495,7 @@
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
*/
-static unsigned long raid5_compute_sector(sector_t r_sector, unsigned
int raid_disks,
+static sector_t raid5_compute_sector(sector_t r_sector, unsigned int
raid_disks,
unsigned int data_disks, unsigned int * dd_idx,
unsigned int * pd_idx, raid5_conf_t *conf)
{
@@ -556,7 +556,7 @@
/*
* Finally, compute the new sector number
*/
- new_sector = stripe * sectors_per_chunk + chunk_offset;
+ new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
return new_sector;
}

@@ -567,7 +567,7 @@
int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
sector_t new_sector = sh->sector, check;
int sectors_per_chunk = conf->chunk_size >> 9;
- long stripe;
+ sector_t stripe;
int chunk_offset;
int chunk_number, dummy1, dummy2, dd_idx = i;
sector_t r_sector;
@@ -1388,7 +1389,7 @@
unsigned long stripe;
int chunk_offset;
int dd_idx, pd_idx;
- unsigned long first_sector;
+ sector_t first_sector;
int raid_disks = conf->raid_disks;
int data_disks = raid_disks-1;

@@ -1401,7 +1402,7 @@
stripe = x;
BUG_ON(x != stripe);

- first_sector =
raid5_compute_sector(stripe*data_disks*sectors_per_chunk
+ first_sector =
raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
if (sh == NULL) {
diff -urN -X /home/efelix/.cvsignore
kernel-source-2.6.3/include/linux/raid/md_k.h
kernel-source-2.6.3evan1/include/linux/raid/md_k.h
--- kernel-source-2.6.3/include/linux/raid/md_k.h 2004-02-19
08:55:57.000000000 +0000
+++ kernel-source-2.6.3evan1/include/linux/raid/md_k.h 2004-03-10
21:14:39.000000000 +0000
@@ -211,9 +211,9 @@

struct mdk_thread_s *thread; /* management thread */
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
- unsigned long curr_resync; /* blocks scheduled */
+ sector_t curr_resync; /* blocks scheduled */
unsigned long resync_mark; /* a recent timestamp */
- unsigned long resync_mark_cnt;/* blocks written at resync_mark */
+ sector_t resync_mark_cnt;/* blocks written at resync_mark */

/* recovery/resync flags
* NEEDED: we might need to start a resync/recover
-------------------

Evan

On Fri, 2004-01-16 at 03:18, Neil Brown wrote:
> On Thursday January 15, evan.felix@xxxxxxx wrote:
> > I've been attempting to create a large raid 5 device using the linux
> > 2.6.1 kernel, with the Large Block Device configured on. I have in the
> > system 16 250G disks. I built an array with mdadm -C -n 15 -x 1
> > /dev/md2 /dev/sd[a-p]
> >
> > The resync/recovery seemed to be going fine, but at some point i started
> > seeing:
> >
> > kernel: compute_blocknr: map not correct
> > kernel: compute_blocknr: map not correct
> ...
> >
> > Has anyone else made an array this large? and does anybody have any
> > pointers on where i can start looking at code to fix this?
>
> I would look in drivers/md/raid5.c and compute_blocknr() in that
> file.
>
> I would change "chunk_number" to a sector_t and change:
> chunk_number = stripe * data_disks + i;
> to read
> chunk_number = (sector_t)stripe * data_disks + i;
>
> Possibly "stripe" and "chunk_number" should both be "sector_t", but
> I'm not at all sure.
>
> Please let me know if it helps.
>
> NeilBrown
--
-------------------------
Evan Felix
Administrator of Supercomputer #5 in Top 500, Nov 2003
Environmental Molecular Sciences Laboratory
Pacific Northwest National Laboratory
Operated for the U.S. DOE by Battelle
diff -urN -X /home/efelix/.cvsignore kernel-source-2.6.3/drivers/md/md.c kernel-source-2.6.3evan1/drivers/md/md.c
--- kernel-source-2.6.3/drivers/md/md.c 2004-02-19 08:54:51.000000000 +0000
+++ kernel-source-2.6.3evan1/drivers/md/md.c 2004-03-17 21:52:25.000000000 +0000
@@ -3138,13 +3138,14 @@
static void md_do_sync(mddev_t *mddev)
{
mddev_t *mddev2;
- unsigned int max_sectors, currspeed = 0,
- j, window;
+ unsigned int currspeed = 0,
+ window;
+ sector_t max_sectors,j;
unsigned long mark[SYNC_MARKS];
- unsigned long mark_cnt[SYNC_MARKS];
+ sector_t mark_cnt[SYNC_MARKS];
int last_mark,m;
struct list_head *tmp;
- unsigned long last_check;
+ sector_t last_check;

/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -3282,7 +3283,7 @@
*/
cond_resched();

- currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+ currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;

if (currspeed > sysctl_speed_limit_min) {
if ((currspeed > sysctl_speed_limit_max) ||
diff -urN -X /home/efelix/.cvsignore kernel-source-2.6.3/drivers/md/raid5.c kernel-source-2.6.3evan1/drivers/md/raid5.c
--- kernel-source-2.6.3/drivers/md/raid5.c 2004-02-19 08:54:52.000000000 +0000
+++ kernel-source-2.6.3evan1/drivers/md/raid5.c 2004-03-17 20:46:52.000000000 +0000
@@ -181,7 +181,7 @@

static void raid5_build_block (struct stripe_head *sh, int i);

-static inline void init_stripe(struct stripe_head *sh, unsigned long sector, int pd_idx)
+static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
{
raid5_conf_t *conf = sh->raid_conf;
int disks = conf->raid_disks, i;
@@ -218,7 +218,7 @@
insert_hash(conf, sh);
}

-static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
{
struct stripe_head *sh;

@@ -231,7 +231,7 @@
return NULL;
}

-static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector,
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
int pd_idx, int noblock)
{
struct stripe_head *sh;
@@ -495,7 +495,7 @@
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
*/
-static unsigned long raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
+static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
unsigned int data_disks, unsigned int * dd_idx,
unsigned int * pd_idx, raid5_conf_t *conf)
{
@@ -556,7 +556,7 @@
/*
* Finally, compute the new sector number
*/
- new_sector = stripe * sectors_per_chunk + chunk_offset;
+ new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
return new_sector;
}

@@ -567,7 +567,7 @@
int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
sector_t new_sector = sh->sector, check;
int sectors_per_chunk = conf->chunk_size >> 9;
- long stripe;
+ sector_t stripe;
int chunk_offset;
int chunk_number, dummy1, dummy2, dd_idx = i;
sector_t r_sector;
@@ -1388,7 +1389,7 @@
unsigned long stripe;
int chunk_offset;
int dd_idx, pd_idx;
- unsigned long first_sector;
+ sector_t first_sector;
int raid_disks = conf->raid_disks;
int data_disks = raid_disks-1;

@@ -1401,7 +1402,7 @@
stripe = x;
BUG_ON(x != stripe);

- first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
+ first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
+ chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
if (sh == NULL) {
diff -urN -X /home/efelix/.cvsignore kernel-source-2.6.3/include/linux/raid/md_k.h kernel-source-2.6.3evan1/include/linux/raid/md_k.h
--- kernel-source-2.6.3/include/linux/raid/md_k.h 2004-02-19 08:55:57.000000000 +0000
+++ kernel-source-2.6.3evan1/include/linux/raid/md_k.h 2004-03-10 21:14:39.000000000 +0000
@@ -211,9 +211,9 @@

struct mdk_thread_s *thread; /* management thread */
struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */
- unsigned long curr_resync; /* blocks scheduled */
+ sector_t curr_resync; /* blocks scheduled */
unsigned long resync_mark; /* a recent timestamp */
- unsigned long resync_mark_cnt;/* blocks written at resync_mark */
+ sector_t resync_mark_cnt;/* blocks written at resync_mark */

/* recovery/resync flags
* NEEDED: we might need to start a resync/recover