Re: Is Swapping on software RAID1 possible in linux 2.4 ?

From: Andrew Morton (andrewm@uow.edu.au)
Date: Wed Jul 11 2001 - 20:48:19 EST


Neil Brown wrote:
>
> Also, can you find out what that process is doing when it is
> unkillable.
> If you compile with alt-sysrq support, then alt-sysrq-t should print
> the process table. If you can get this out of dmesg and run it through
> ksymoops it might be most interesting.

Neil, he showed us a trace the other day - kswapd was
stuck in raid1_alloc_r1bh(). This is basically the
same situation as I had yesterday, where bdflush was stuck
in the same place.

It is completely fatal to the VM for these two processes to
get stuck in this way. The approach I took was to beef up
the reserved bh queues and to keep a number of them
reserved *only* for the swapout and dirty buffer flush functions.
That way, we have at hand the memory we need to be able to
free up memory.
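
In isolation, the reservation rule looks like this (a standalone
sketch with made-up names, not the patch itself - the real thing is
may_take_bh()/may_take_r1bh() below):

        struct item { struct item *next; };     /* stand-in for a bh */

        struct pool {
                struct item *free_list;
                int free_cnt;           /* length of free_list */
        };

        #define POOL_RESERVED 16        /* held back for memory freers */

        static int may_take(struct pool *pool, int flusher)
        {
                /*
                 * Ordinary callers must leave POOL_RESERVED items
                 * behind; a flusher may drain the pool completely,
                 * because it is the one trying to free memory.
                 */
                int min_free = flusher ? 0 : POOL_RESERVED;

                return pool->free_cnt > min_free;
        }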

It was necessary to define a new task_struct.flags bit so we
can identify when the caller is a `buffer flusher' - I expect
we'll need that in other places as well.
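
Usage is just a flag set once at thread startup and tested on the
allocation path - this is exactly what the bdflush, kupdate and
raid1.c hunks below do:

        /* in the flusher thread's initialisation: */
        current->flags |= PF_FLUSH;

        /* in raid1_alloc_bh()'s admission test: */
        if (current->flags & (PF_FLUSH|PF_MEMALLOC))
                min_free = cnt;         /* may dip into the reserves */
        else
                min_free = cnt + FREER1_MEMALLOC_RESERVED * conf->raid_disks;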

An easy way to demonstrate the problem is to put ext3 on RAID1,
boot with `mem=64m' and run `dd if=/dev/zero of=foo bs=1024k count=1k'.
The machine wedges on the first run. This is due to a bdflush deadlock.
Once swap is on RAID1, there will be kswapd deadlocks as well. The
patch *should* fix those, but I haven't tested that.

Could you please review these changes?

BTW: I removed the initial buffer_head reservation code. It's
not necessary with the modified reservation algorithm - as soon
as we start to use the device the reserve pools will build
up. There will be a deadlock opportunity if the machine is totally
and utterly oom when the RAID device initially starts up, but it's
really not worth the code space to even bother about this.
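
The build-up happens on the free path: instead of kfree()ing a buffer
we keep it until the reserve is full. Schematically, with the same
made-up names as the sketch above:

        static void pool_free(struct pool *pool, struct item *it)
        {
                if (pool->free_cnt >= POOL_RESERVED) {
                        kfree(it);              /* reserve is full */
                } else {
                        it->next = pool->free_list;
                        pool->free_list = it;
                        pool->free_cnt++;       /* reserve grows with use */
                }
        }

This is what the modified raid1_free_bh() and raid1_free_r1bh() do,
with conf->device_lock held.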

--- linux-2.4.6/include/linux/sched.h Wed May 2 22:00:07 2001
+++ lk-ext3/include/linux/sched.h Thu Jul 12 01:03:20 2001
@@ -413,7 +418,7 @@ struct task_struct {
 #define PF_SIGNALED 0x00000400 /* killed by a signal */
 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
 #define PF_VFORK 0x00001000 /* Wake up parent in mm_release */
-
+#define PF_FLUSH 0x00002000 /* Flushes buffers to disk */
 #define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */
 
 /*
--- linux-2.4.6/include/linux/raid/raid1.h Tue Dec 12 08:20:08 2000
+++ lk-ext3/include/linux/raid/raid1.h Thu Jul 12 01:15:39 2001
@@ -37,12 +37,12 @@ struct raid1_private_data {
         /* buffer pool */
         /* buffer_heads that we have pre-allocated have b_pprev -> &freebh
          * and are linked into a stack using b_next
-         * raid1_bh that are pre-allocated have R1BH_PreAlloc set.
          * All these variable are protected by device_lock
          */
         struct buffer_head *freebh;
         int freebh_cnt; /* how many are on the list */
         struct raid1_bh *freer1;
+        unsigned freer1_cnt;
         struct raid1_bh *freebuf; /* each bh_req has a page allocated */
         md_wait_queue_head_t wait_buffer;
 
@@ -87,5 +87,4 @@ struct raid1_bh {
 /* bits for raid1_bh.state */
 #define R1BH_Uptodate 1
 #define R1BH_SyncPhase 2
-#define R1BH_PreAlloc 3 /* this was pre-allocated, add to free list */
 #endif
--- linux-2.4.6/fs/buffer.c Wed Jul 4 18:21:31 2001
+++ lk-ext3/fs/buffer.c Thu Jul 12 01:03:57 2001
@@ -2685,6 +2748,7 @@ int bdflush(void *sem)
         sigfillset(&tsk->blocked);
         recalc_sigpending(tsk);
         spin_unlock_irq(&tsk->sigmask_lock);
+        current->flags |= PF_FLUSH;
 
         up((struct semaphore *)sem);
 
@@ -2726,6 +2790,7 @@ int kupdate(void *sem)
         siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
         recalc_sigpending(tsk);
         spin_unlock_irq(&tsk->sigmask_lock);
+        current->flags |= PF_FLUSH;
 
         up((struct semaphore *)sem);
 
--- linux-2.4.6/drivers/md/raid1.c Wed Jul 4 18:21:26 2001
+++ lk-ext3/drivers/md/raid1.c Thu Jul 12 01:28:58 2001
@@ -51,6 +51,28 @@ static mdk_personality_t raid1_personali
 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
 
+/*
+ * We need to scale the number of reserved buffers by the page size
+ * to make writepage()s successful. --akpm
+ */
+#define R1_BLOCKS_PP (PAGE_CACHE_SIZE / 1024)
+#define FREER1_MEMALLOC_RESERVED (16 * R1_BLOCKS_PP)
+
+/*
+ * Return true if the caller may take a bh from the list.
+ * PF_FLUSH and PF_MEMALLOC tasks are allowed to use the reserves, because
+ * they're trying to *free* some memory.
+ *
+ * Requires that conf->device_lock be held.
+ */
+static int may_take_bh(raid1_conf_t *conf, int cnt)
+{
+        int min_free = (current->flags & (PF_FLUSH|PF_MEMALLOC)) ?
+                cnt :
+                (cnt + FREER1_MEMALLOC_RESERVED * conf->raid_disks);
+        return conf->freebh_cnt >= min_free;
+}
+
 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
 {
         /* return a linked list of "cnt" struct buffer_heads.
@@ -62,7 +84,7 @@ static struct buffer_head *raid1_alloc_b
         while(cnt) {
                 struct buffer_head *t;
                 md_spin_lock_irq(&conf->device_lock);
-                if (conf->freebh_cnt >= cnt)
+                if (may_take_bh(conf, cnt))
                         while (cnt) {
                                 t = conf->freebh;
                                 conf->freebh = t->b_next;
@@ -83,7 +105,7 @@ static struct buffer_head *raid1_alloc_b
                         cnt--;
                 } else {
                         PRINTK("raid1: waiting for %d bh\n", cnt);
-                        wait_event(conf->wait_buffer, conf->freebh_cnt >= cnt);
+                        wait_event(conf->wait_buffer, may_take_bh(conf, cnt));
                 }
         }
         return bh;
@@ -96,9 +118,9 @@ static inline void raid1_free_bh(raid1_c
         while (bh) {
                 struct buffer_head *t = bh;
                 bh=bh->b_next;
-                if (t->b_pprev == NULL)
+                if (conf->freebh_cnt >= FREER1_MEMALLOC_RESERVED) {
                         kfree(t);
-                else {
+                } else {
                         t->b_next= conf->freebh;
                         conf->freebh = t;
                         conf->freebh_cnt++;
@@ -108,29 +130,6 @@ static inline void raid1_free_bh(raid1_c
         wake_up(&conf->wait_buffer);
 }
 
-static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
-{
-        /* allocate cnt buffer_heads, possibly less if kalloc fails */
-        int i = 0;
-
-        while (i < cnt) {
-                struct buffer_head *bh;
-                bh = kmalloc(sizeof(*bh), GFP_KERNEL);
-                if (!bh) break;
-                memset(bh, 0, sizeof(*bh));
-
-                md_spin_lock_irq(&conf->device_lock);
-                bh->b_pprev = &conf->freebh;
-                bh->b_next = conf->freebh;
-                conf->freebh = bh;
-                conf->freebh_cnt++;
-                md_spin_unlock_irq(&conf->device_lock);
-
-                i++;
-        }
-        return i;
-}
-
 static int raid1_shrink_bh(raid1_conf_t *conf, int cnt)
 {
         /* discard cnt buffer_heads, if we can find them */
@@ -147,7 +146,16 @@ static int raid1_shrink_bh(raid1_conf_t
         md_spin_unlock_irq(&conf->device_lock);
         return i;
 }
-
+
+/*
+ * Return true if the caller may take a raid1_bh from the list.
+ * Requires that conf->device_lock be held.
+ */
+static int may_take_r1bh(raid1_conf_t *conf)
+{
+        return ((conf->freer1_cnt > FREER1_MEMALLOC_RESERVED) ||
+                (current->flags & (PF_FLUSH|PF_MEMALLOC))) && conf->freer1;
+}
 
 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
 {
@@ -155,8 +163,9 @@ static struct raid1_bh *raid1_alloc_r1bh
 
         do {
                 md_spin_lock_irq(&conf->device_lock);
-                if (conf->freer1) {
+                if (may_take_r1bh(conf)) {
                         r1_bh = conf->freer1;
+                        conf->freer1_cnt--;
                         conf->freer1 = r1_bh->next_r1;
                         r1_bh->next_r1 = NULL;
                         r1_bh->state = 0;
@@ -170,7 +179,7 @@ static struct raid1_bh *raid1_alloc_r1bh
                         memset(r1_bh, 0, sizeof(*r1_bh));
                         return r1_bh;
                 }
-                wait_event(conf->wait_buffer, conf->freer1);
+                wait_event(conf->wait_buffer, may_take_r1bh(conf));
         } while (1);
 }
 
@@ -178,49 +187,30 @@ static inline void raid1_free_r1bh(struc
 {
         struct buffer_head *bh = r1_bh->mirror_bh_list;
         raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+        unsigned long flags;
 
         r1_bh->mirror_bh_list = NULL;
 
-        if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
-                unsigned long flags;
-                spin_lock_irqsave(&conf->device_lock, flags);
+        spin_lock_irqsave(&conf->device_lock, flags);
+        if (conf->freer1_cnt < FREER1_MEMALLOC_RESERVED) {
                 r1_bh->next_r1 = conf->freer1;
                 conf->freer1 = r1_bh;
+                conf->freer1_cnt++;
                 spin_unlock_irqrestore(&conf->device_lock, flags);
         } else {
+                spin_unlock_irqrestore(&conf->device_lock, flags);
                 kfree(r1_bh);
         }
         raid1_free_bh(conf, bh);
 }
 
-static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
-{
-        int i = 0;
-
-        while (i < cnt) {
-                struct raid1_bh *r1_bh;
-                r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
-                if (!r1_bh)
-                        break;
-                memset(r1_bh, 0, sizeof(*r1_bh));
-
-                md_spin_lock_irq(&conf->device_lock);
-                set_bit(R1BH_PreAlloc, &r1_bh->state);
-                r1_bh->next_r1 = conf->freer1;
-                conf->freer1 = r1_bh;
-                md_spin_unlock_irq(&conf->device_lock);
-
-                i++;
-        }
-        return i;
-}
-
 static void raid1_shrink_r1bh(raid1_conf_t *conf)
 {
         md_spin_lock_irq(&conf->device_lock);
         while (conf->freer1) {
                 struct raid1_bh *r1_bh = conf->freer1;
                 conf->freer1 = r1_bh->next_r1;
+                conf->freer1_cnt--; /* pedantry */
                 kfree(r1_bh);
         }
         md_spin_unlock_irq(&conf->device_lock);
@@ -1610,21 +1600,6 @@ static int raid1_run (mddev_t *mddev)
                 goto out_free_conf;
         }
 
-
-        /* pre-allocate some buffer_head structures.
-         * As a minimum, 1 r1bh and raid_disks buffer_heads
-         * would probably get us by in tight memory situations,
-         * but a few more is probably a good idea.
-         * For now, try 16 r1bh and 16*raid_disks bufferheads
-         * This will allow at least 16 concurrent reads or writes
-         * even if kmalloc starts failing
-         */
-        if (raid1_grow_r1bh(conf, 16) < 16 ||
-            raid1_grow_bh(conf, 16*conf->raid_disks) < 16*conf->raid_disks) {
-                printk(MEM_ERROR, mdidx(mddev));
-                goto out_free_conf;
-        }
-
         for (i = 0; i < MD_SB_DISKS; i++) {
                 
                 descriptor = sb->disks+i;
@@ -1713,6 +1688,8 @@ out_free_conf:
         raid1_shrink_r1bh(conf);
         raid1_shrink_bh(conf, conf->freebh_cnt);
         raid1_shrink_buffers(conf);
+        if (conf->freer1_cnt != 0)
+                BUG();
         kfree(conf);
         mddev->private = NULL;
 out: