Re: [Kiobuf-io-devel] RFC: Kernel mechanism: Compound event wait

From: Neil Brown (neilb@cse.unsw.edu.au)
Date: Wed Feb 07 2001 - 19:34:46 EST


On Wednesday February 7, torvalds@transmeta.com wrote:
>
>
> On Wed, 7 Feb 2001, Christoph Hellwig wrote:
>
> > On Tue, Feb 06, 2001 at 12:59:02PM -0800, Linus Torvalds wrote:
> > >
> > > Actually, they really aren't.
> > >
> > > They kind of _used_ to be, but more and more they've moved away from that
> > > historical use. Check in particular the page cache, and as a really
> > > extreme case the swap cache version of the page cache.
> >
> > Yes. And that's exactly why I think it's ugly to have the left-over
> > caching stuff in the same data structure as the IO buffer.
>
> I do agree.
>
> I would not be opposed to factoring out the "pure block IO" part from the
> bh struct. It should not even be very hard. You'd do something like
>
> struct block_io {
> .. here is the stuff needed for block IO ..
> };
>
> struct buffer_head {
> struct block_io io;
> .. here is the stuff needed for hashing etc ..
> };
>
> and then you make "generic_make_request()" and everything lower down take
> just the "struct block_io".
>

I was just thinking the same thing, or something similar.
I wanted to do

    struct io_head {
         /* stuff needed for block IO */
    };
    struct buffer_head {
         struct io_head;     /* unnamed substructure */
         /* more stuff, for hashing etc */
    };

so that, as an unnamed substructure, the contents of the struct io_head
would automagically be promoted to appear to be contents of the
buffer_head.
However, I then remembered (when it didn't work) that unnamed
substructures are a feature of the Plan-9 C compiler, not the GNU
Compiler Collection. (Any gcc coders out there think this would be a
good thing to add?
  http://plan9.bell-labs.com/sys/doc/compiler.html
)
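
(For concreteness: under that extension, something like the following
would compile, with the members of the embedded io_head promoted into
buffer_head.  The two fields and the helper function here are purely
illustrative, and gcc of course rejects the unnamed member - which is
exactly what I tripped over.)

    struct io_head {
        unsigned long b_state;    /* buffer state bitmap */
        char *b_data;             /* pointer to data block */
    };
    struct buffer_head {
        struct io_head;           /* unnamed substructure: Plan-9 only */
        unsigned long b_blocknr;  /* block number */
    };

    void mark_invalid(struct buffer_head *bh)
    {
        /* promotion makes the embedded field look like bh's own */
        bh->b_state = 0;
    }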

Anyway, I produced the same result in a rather ugly way with #defines,
and modified raid5 to use 32-byte io_head structures instead of the
80+ byte buffer_heads, and it ... doesn't quite work :-( It boots
fine, but raid5 dies and the Oops message is a few kilometers away.
Still, I think the concept is fine.

Patch is below for your inspection.

It occurs to me that Stephen's desire to pass lots of requests through
make_request all at once isn't a bad idea and could be done by simply
linking the io_heads together with b_reqnext.
This would require:
  1/ all callers of generic_make_request (there are 3) to initialise
     b_reqnext
  2/ all registered make_request_fn functions (there are again 3 I
     think) to cope with following b_reqnext

It shouldn't be too hard to make the elevator code take advantage of
any ordering that it finds in the list.

I don't have a patch which does this.
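
But to make the chaining concrete, here is an untested sketch of a
make_request_fn that copes with following b_reqnext.
handle_one_request() is a made-up name standing in for whatever the
driver already does for a single buffer:

    static int chained_make_request(request_queue_t *q, int rw,
                                    struct buffer_head *bh)
    {
        /* walk the b_reqnext chain; callers must NULL-terminate it */
        while (bh != NULL) {
            struct buffer_head *next = bh->b_reqnext;
            bh->b_reqnext = NULL;           /* detach before handling */
            handle_one_request(q, rw, bh);  /* per-buffer work */
            bh = next;
        }
        return 0;
    }

A caller would then just set b_reqnext on each buffer and hand the
head of the list to generic_make_request.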

NeilBrown

--- ./include/linux/fs.h 2001/02/07 22:45:37 1.1
+++ ./include/linux/fs.h 2001/02/07 23:09:05
@@ -207,6 +207,7 @@
 #define BH_Protected 6 /* 1 if the buffer is protected */
 
 /*
+ * THIS COMMENT IS NO LONGER CORRECT.
  * Try to keep the most commonly used fields in single cache lines (16
  * bytes) to improve performance. This ordering should be
  * particularly beneficial on 32-bit processors.
@@ -217,31 +218,43 @@
  * The second 16 bytes we use for lru buffer scans, as used by
  * sync_buffers() and refill_freelist(). -- sct
  */
+
+/*
+ * io_head is all that is needed by device drivers.
+ */
+#define io_head_fields \
+ unsigned long b_state; /* buffer state bitmap (see above) */ \
+ struct buffer_head *b_reqnext; /* request queue */ \
+ unsigned short b_size; /* block size */ \
+ kdev_t b_rdev; /* Real device */ \
+ unsigned long b_rsector; /* Real buffer location on disk */ \
+ char * b_data; /* pointer to data block (512 byte) */ \
+ void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ \
+ void *b_private; /* reserved for b_end_io */ \
+ struct page *b_page; /* the page this bh is mapped to */ \
+ /* this line intentionally left blank */
+struct io_head {
+ io_head_fields
+};
+
+/* buffer_head adds all the stuff needed by the buffer cache */
 struct buffer_head {
- /* First cache line: */
+ io_head_fields
+
         struct buffer_head *b_next; /* Hash queue list */
         unsigned long b_blocknr; /* block number */
- unsigned short b_size; /* block size */
         unsigned short b_list; /* List that this buffer appears */
         kdev_t b_dev; /* device (B_FREE = free) */
 
         atomic_t b_count; /* users using this block */
- kdev_t b_rdev; /* Real device */
- unsigned long b_state; /* buffer state bitmap (see above) */
         unsigned long b_flushtime; /* Time when (dirty) buffer should be written */
 
         struct buffer_head *b_next_free;/* lru/free list linkage */
         struct buffer_head *b_prev_free;/* doubly linked list of buffers */
         struct buffer_head *b_this_page;/* circular list of buffers in one page */
- struct buffer_head *b_reqnext; /* request queue */
 
         struct buffer_head **b_pprev; /* doubly linked list of hash-queue */
- char * b_data; /* pointer to data block (512 byte) */
- struct page *b_page; /* the page this bh is mapped to */
- void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
- void *b_private; /* reserved for b_end_io */
 
- unsigned long b_rsector; /* Real buffer location on disk */
         wait_queue_head_t b_wait;
 
         struct inode * b_inode;
--- ./drivers/md/raid5.c 2001/02/06 05:43:31 1.2
+++ ./drivers/md/raid5.c 2001/02/07 23:15:36
@@ -151,18 +151,16 @@
 
         for (i=0; i<num; i++) {
                 struct page *page;
- bh = kmalloc(sizeof(struct buffer_head), priority);
+ bh = kmalloc(sizeof(struct io_head), priority);
                 if (!bh)
                         return 1;
- memset(bh, 0, sizeof (struct buffer_head));
- init_waitqueue_head(&bh->b_wait);
+ memset(bh, 0, sizeof (struct io_head));
                 page = alloc_page(priority);
                 bh->b_data = page_address(page);
                 if (!bh->b_data) {
                         kfree(bh);
                         return 1;
                 }
- atomic_set(&bh->b_count, 0);
                 bh->b_page = page;
                 sh->bh_cache[i] = bh;
 
@@ -412,7 +410,7 @@
                         spin_lock_irqsave(&conf->device_lock, flags);
                 }
         } else {
- md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
+ md_error(mddev_to_kdev(conf->mddev), conf->disks[i].dev);
                 clear_bit(BH_Uptodate, &bh->b_state);
         }
         clear_bit(BH_Lock, &bh->b_state);
@@ -440,7 +438,7 @@
 
         md_spin_lock_irqsave(&conf->device_lock, flags);
         if (!uptodate)
- md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
+ md_error(mddev_to_kdev(conf->mddev), conf->disks[i].dev);
         clear_bit(BH_Lock, &bh->b_state);
         set_bit(STRIPE_HANDLE, &sh->state);
         __release_stripe(conf, sh);
@@ -456,12 +454,10 @@
         unsigned long block = sh->sector / (sh->size >> 9);
 
         init_buffer(bh, raid5_end_read_request, sh);
- bh->b_dev = conf->disks[i].dev;
         bh->b_blocknr = block;
 
         bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
         bh->b_size = sh->size;
- bh->b_list = BUF_LOCKED;
         return bh;
 }
 
@@ -1085,15 +1081,14 @@
                         else
                                 bh->b_end_io = raid5_end_write_request;
                         if (conf->disks[i].operational)
- bh->b_dev = conf->disks[i].dev;
+ bh->b_rdev = conf->disks[i].dev;
                         else if (conf->spare && action[i] == WRITE+1)
- bh->b_dev = conf->spare->dev;
+ bh->b_rdev = conf->spare->dev;
                         else skip=1;
                         if (!skip) {
                                 PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
                                 atomic_inc(&sh->count);
- bh->b_rdev = bh->b_dev;
- bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+ bh->b_rsector = sh->sector;
                                 generic_make_request(action[i]-1, bh);
                         } else {
                                 PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
@@ -1502,7 +1497,7 @@
         }
 
         memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
- conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+ conf->raid_disks * ((sizeof(struct io_head) + PAGE_SIZE))) / 1024;
         if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
                 printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
                 shrink_stripes(conf, conf->max_nr_stripes);