Re: [patch] FS (ext2) corruption generated by BLKFLSBUF/invalidate_buffers

Andrea Arcangeli (andrea@suse.de)
Fri, 26 Nov 1999 15:16:41 +0100 (CET)


>On Fri, 26 Nov 1999, Guest section DW wrote:

>>I have an aesthetic comment: instead of giving invalidate_buffers()
>>a second argument, and changing the source in lots of places, why

Just a side note: for me changing lots of sources made no difference as I
had to check all such places anyway ;).

>>don't you introduce a second routine invalidate_all_buffers()
>>that is called two places or so and does the invalidate-also-dirty.

The way I like to cleanup this is to rename invalidate_buffers to
__invalidate_buffers and to use the #define to hide the parameter as I
told you privately. I don't like using additional functions interface or
splitting the source as the #define way seems to just have all the
advantages and none downside.

So I changed my tree this way (incremental with the previous
buffer-races-2.2.14pre7.gz):

diff -urN 2.2.14pre7-buf-races/fs/buffer.c 2.2.14pre7-buf-races2/fs/buffer.c
--- 2.2.14pre7-buf-races/fs/buffer.c Fri Nov 26 14:51:40 1999
+++ 2.2.14pre7-buf-races2/fs/buffer.c Fri Nov 26 14:52:24 1999
@@ -610,7 +610,7 @@
These are two special cases. Normal usage imply the device driver
to issue a sync on the device (without waiting I/O completation) and
then an invalidate_buffers call that doesn't trashes dirty buffers. */
-void invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
{
int i, nlist, slept;
struct buffer_head * bh, * bhnext;
diff -urN 2.2.14pre7-buf-races/include/linux/fs.h 2.2.14pre7-buf-races2/include/linux/fs.h
--- 2.2.14pre7-buf-races/include/linux/fs.h Fri Nov 26 14:51:40 1999
+++ 2.2.14pre7-buf-races2/include/linux/fs.h Fri Nov 26 14:54:39 1999
@@ -783,7 +783,9 @@
extern int check_disk_change(kdev_t dev);
extern int invalidate_inodes(struct super_block * sb);
extern void invalidate_inode_pages(struct inode *);
-extern void invalidate_buffers(kdev_t dev, int);
+#define invalidate_buffers(dev) __invalidate_buffers((dev), 0)
+#define destroy_buffers(dev) __invalidate_buffers((dev), 1)
+extern void __invalidate_buffers(kdev_t dev, int);
extern int floppy_is_wp(int minor);
extern void sync_inodes(kdev_t dev);
extern void write_inode_now(struct inode *inode);

(skipped the not interesting part ;)

So the C code generated is the same after the preprocessor (except an
additional __ ;) and the interface looks cleaner.

This below is a new patch against clean 2.2.14pre7 that obsoletes the
patch I posted yesterday (but that make no difference at all for
production and it produces the same binary image):

diff -urN 2.2.14pre7/drivers/block/rd.c 2.2.14pre7-buf-races2/drivers/block/rd.c
--- 2.2.14pre7/drivers/block/rd.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races2/drivers/block/rd.c Fri Nov 26 14:57:39 1999
@@ -191,7 +191,10 @@
switch (cmd) {
case BLKFLSBUF:
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
- invalidate_buffers(inode->i_rdev);
+ /* special: we want to release the ramdisk memory,
+ it's not like with the other blockdevices where
+ this ioctl only flushes away the buffer cache. */
+ destroy_buffers(inode->i_rdev);
break;

case BLKGETSIZE: /* Return device size */
@@ -347,7 +350,7 @@
int i;

for (i = 0 ; i < NUM_RAMDISKS; i++)
- invalidate_buffers(MKDEV(MAJOR_NR, i));
+ destroy_buffers(MKDEV(MAJOR_NR, i));

unregister_blkdev( MAJOR_NR, "ramdisk" );
blk_dev[MAJOR_NR].request_fn = 0;
diff -urN 2.2.14pre7/fs/buffer.c 2.2.14pre7-buf-races2/fs/buffer.c
--- 2.2.14pre7/fs/buffer.c Sun Nov 21 03:31:32 1999
+++ 2.2.14pre7-buf-races2/fs/buffer.c Fri Nov 26 14:52:24 1999
@@ -24,6 +24,9 @@
* - RMK
*/

+/* invalidate_buffers/set_blocksize/sync_dev race conditions and
+ fs corruption fixes, 1999, Andrea Arcangeli <andrea@suse.de> */
+
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
@@ -263,11 +266,14 @@

void sync_dev(kdev_t dev)
{
- sync_buffers(dev, 0);
sync_supers(dev);
sync_inodes(dev);
- sync_buffers(dev, 0);
DQUOT_SYNC(dev);
+ /* sync all the dirty buffers out to disk only _after_ all the
+ high level layers finished generated buffer dirty data
+ (or we'll return with some buffer still dirty on the blockdevice
+ so breaking the semantics of this call) */
+ sync_buffers(dev, 0);
/*
* FIXME(eric) we need to sync the physical devices here.
* This is because some (scsi) controllers have huge amounts of
@@ -396,31 +402,6 @@
return err;
}

-void invalidate_buffers(kdev_t dev)
-{
- int i;
- int nlist;
- struct buffer_head * bh;
-
- for(nlist = 0; nlist < NR_LIST; nlist++) {
- bh = lru_list[nlist];
- for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
- if (bh->b_dev != dev)
- continue;
- wait_on_buffer(bh);
- if (bh->b_dev != dev)
- continue;
- if (bh->b_count)
- continue;
- bh->b_flushtime = 0;
- clear_bit(BH_Protected, &bh->b_state);
- clear_bit(BH_Uptodate, &bh->b_state);
- clear_bit(BH_Dirty, &bh->b_state);
- clear_bit(BH_Req, &bh->b_state);
- }
- }
-}
-
/* After several hours of tedious analysis, the following hash
* function won. Do not mess with it... -DaveM
*/
@@ -492,11 +473,14 @@
remove_from_lru_list(bh);
}

-static inline void put_last_free(struct buffer_head * bh)
+static void put_last_free(struct buffer_head * bh)
{
if (bh) {
struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];

+ bh->b_count = 0;
+ bh->b_state = 0;
+ remove_from_queues(bh);
bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */

/* Add to back of free list. */
@@ -516,7 +500,7 @@
{
/* put at end of free list */
if(bh->b_dev == B_FREE) {
- put_last_free(bh);
+ panic("B_FREE inserted into queues");
} else {
struct buffer_head **bhp = &lru_list[bh->b_list];

@@ -606,10 +590,62 @@
return 0;
}

+/* If invalidate_buffers() will trash dirty buffers, it means some kind
+ of fs corruption is going on. Trashing dirty data always imply losing
+ information that was supposed to be just stored on the physical layer
+ by the user.
+
+ Thus invalidate_buffers in general usage is not allwowed to trash dirty
+ buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
+
+ NOTE: In the case where the user removed a removable-media-disk even if
+ there's still dirty data not synced on disk (due a bug in the device driver
+ or due an error of the user), by not destroying the dirty buffers we could
+ generate corruption also on the next media inserted, thus a parameter is
+ necessary to handle this case in the most safe way possible (trying
+ to not corrupt also the new disk inserted with the data belonging to
+ the old now corrupted disk). Also for the ramdisk the natural thing
+ to do in order to release the ramdisk memory is to destroy dirty buffers.
+
+ These are two special cases. Normal usage imply the device driver
+ to issue a sync on the device (without waiting I/O completation) and
+ then an invalidate_buffers call that doesn't trashes dirty buffers. */
+void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+{
+ int i, nlist, slept;
+ struct buffer_head * bh, * bhnext;
+
+ again:
+ slept = 0;
+ for(nlist = 0; nlist < NR_LIST; nlist++) {
+ bh = lru_list[nlist];
+ if (!bh)
+ continue;
+ for (i = nr_buffers_type[nlist] ; i > 0 ;
+ bh = bhnext, i--)
+ {
+ bhnext = bh->b_next_free;
+ if (bh->b_dev != dev)
+ continue;
+ if (!destroy_dirty_buffers && buffer_dirty(bh))
+ continue;
+ if (buffer_locked(bh))
+ {
+ slept = 1;
+ __wait_on_buffer(bh);
+ }
+ if (!bh->b_count)
+ put_last_free(bh);
+ if (slept)
+ goto again;
+ }
+ }
+}
+
void set_blocksize(kdev_t dev, int size)
{
extern int *blksize_size[];
- int i, nlist;
+ int i, nlist, slept;
struct buffer_head * bh, *bhnext;

if (!blksize_size[MAJOR(dev)])
@@ -631,27 +667,35 @@
/* We need to be quite careful how we do this - we are moving entries
* around on the free list, and we can get in a loop if we are not careful.
*/
- for(nlist = 0; nlist < NR_LIST; nlist++) {
+ again:
+ slept = 0;
+ for(nlist = 0; nlist < NR_LIST; nlist++) {
bh = lru_list[nlist];
- for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
- if(!bh)
- break;
-
- bhnext = bh->b_next_free;
- if (bh->b_dev != dev)
- continue;
- if (bh->b_size == size)
- continue;
- bhnext->b_count++;
- wait_on_buffer(bh);
- bhnext->b_count--;
- if (bh->b_dev == dev && bh->b_size != size) {
- clear_bit(BH_Dirty, &bh->b_state);
- clear_bit(BH_Uptodate, &bh->b_state);
- clear_bit(BH_Req, &bh->b_state);
- bh->b_flushtime = 0;
+ if (!bh)
+ continue;
+ for (i = nr_buffers_type[nlist] ; i > 0 ;
+ bh = bhnext, i--)
+ {
+ bhnext = bh->b_next_free;
+ if (bh->b_dev != dev || bh->b_size == size)
+ continue;
+ if (buffer_dirty(bh))
+ printk(KERN_ERR "set_blocksize: dev %s buffer_dirty %lu size %lu\n", kdevname(dev), bh->b_blocknr, bh->b_size);
+ if (buffer_locked(bh))
+ {
+ slept = 1;
+ wait_on_buffer(bh);
}
- remove_from_hash_queue(bh);
+ if (!bh->b_count)
+ put_last_free(bh);
+ else
+ printk(KERN_ERR
+ "set_blocksize: "
+ "b_count %d, dev %s, block %lu!\n",
+ bh->b_count, bdevname(bh->b_dev),
+ bh->b_blocknr);
+ if (slept)
+ goto again;
}
}
}
@@ -831,9 +875,6 @@
__brelse(buf);
return;
}
- buf->b_count = 0;
- buf->b_state = 0;
- remove_from_queues(buf);
put_last_free(buf);
}

diff -urN 2.2.14pre7/fs/devices.c 2.2.14pre7-buf-races2/fs/devices.c
--- 2.2.14pre7/fs/devices.c Sun Oct 31 23:31:33 1999
+++ 2.2.14pre7-buf-races2/fs/devices.c Fri Nov 26 14:59:19 1999
@@ -216,7 +216,8 @@
if (sb && invalidate_inodes(sb))
printk("VFS: busy inodes on changed media.\n");

- invalidate_buffers(dev);
+ /* special: trash all dirty data as well as the media is changed */
+ destroy_buffers(dev);

if (fops->revalidate)
fops->revalidate(dev);
diff -urN 2.2.14pre7/fs/super.c 2.2.14pre7-buf-races2/fs/super.c
--- 2.2.14pre7/fs/super.c Sun Oct 31 23:31:33 1999
+++ 2.2.14pre7-buf-races2/fs/super.c Fri Nov 26 14:59:28 1999
@@ -1276,7 +1276,10 @@
umount_error = do_umount(old_root_dev,1, 0);
if (!umount_error) {
printk("okay\n");
- invalidate_buffers(old_root_dev);
+ /* special: the old device driver is going to be
+ a ramdisk and the point of this call is to free its
+ protected memory (even if dirty). */
+ destroy_buffers(old_root_dev);
return 0;
}
printk(KERN_ERR "error %d\n",umount_error);
diff -urN 2.2.14pre7/include/linux/fs.h 2.2.14pre7-buf-races2/include/linux/fs.h
--- 2.2.14pre7/include/linux/fs.h Sun Nov 21 03:31:32 1999
+++ 2.2.14pre7-buf-races2/include/linux/fs.h Fri Nov 26 14:54:39 1999
@@ -783,7 +783,9 @@
extern int check_disk_change(kdev_t dev);
extern int invalidate_inodes(struct super_block * sb);
extern void invalidate_inode_pages(struct inode *);
-extern void invalidate_buffers(kdev_t dev);
+#define invalidate_buffers(dev) __invalidate_buffers((dev), 0)
+#define destroy_buffers(dev) __invalidate_buffers((dev), 1)
+extern void __invalidate_buffers(kdev_t dev, int);
extern int floppy_is_wp(int minor);
extern void sync_inodes(kdev_t dev);
extern void write_inode_now(struct inode *inode);
diff -urN 2.2.14pre7/kernel/ksyms.c 2.2.14pre7-buf-races2/kernel/ksyms.c
--- 2.2.14pre7/kernel/ksyms.c Sun Nov 21 03:31:32 1999
+++ 2.2.14pre7-buf-races2/kernel/ksyms.c Fri Nov 26 14:52:41 1999
@@ -148,7 +148,7 @@
EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(put_filp);
EXPORT_SYMBOL(check_disk_change);
-EXPORT_SYMBOL(invalidate_buffers);
+EXPORT_SYMBOL(__invalidate_buffers);
EXPORT_SYMBOL(invalidate_inodes);
EXPORT_SYMBOL(invalidate_inode_pages);
EXPORT_SYMBOL(truncate_inode_pages);

Downloadable also from here:

ftp://ftp.*.kernel.org/pub/linux/kernel/people/andrea/patches/v2.2/2.2.14pre7/buffer-races-2.2.14pre7-2.gz

Thank you for your feedback. Further comments are welcome ;).

Andrea

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/