[patch] FS (ext2) corruption generated by BLKFLSBUF/invalidate_buffers

Andrea Arcangeli (andrea@suse.de)
Thu, 25 Nov 1999 19:14:05 +0100 (CET)


Jason and Samuel told me they could reliable reproduce fs corruption by
writing to a filesystem while the ioctl(BLKFLSBUF) is running in parallel
on the blockdevice where the fs is mounted.

I understood what was going on: BLKFLSBUF is just an interface for
invalidate_buffers and invalidate_buffers() trashes away _all_ the buffers
beloging to such a blockdevice. _Dirty_ buffers included.

As hdparm is using such ioctl too so a simple:

cp /dev/zero /tmp
hdparm -t /dev/<blockdevice_where_tmp_is_placed>

can just corrupt the hd badly.

Actually the corruption is hided pretty well by both hdparm and the kernel
doing a sync on the device before starting invalidate_buffers.

The only two cases where we want invalidate_buffers() to flush away _also_
dirty buffers are:

o upon detected media change
o releasing fdisk memory

In all other cases where the kernel want to invalidate the buffers, it
should first sync the device (no need to wait I/O completation, so an
sync_buffers(dev, 0) is enough if no filesystem is mounted or
sync_dev(dev) if the filesystem is mounted) and then calling an
invalidate_buffers that won't trash dirty blocks. So if new dirty blocks
are generated under invalidate_buffers (legitimate in cases like
BLKFLSBUF), they won't be dropped.

In general we should _never_ drop dirty buffers. If we need to drop dirty
buffers (excluding the ramdisk case) it means we have some kind of fs
corruption going on.

This is my fix against 2.2.14pre7 (probably it won't apply to previous
kernel for unrelated but trivially fixable rejects).

The fix also includes my longstanding race fixes in
set_blocksize/invalidate_buffers and I also noticed that sync_dev is
generating dirty data via the qutoa code _after_ the last sync_buffers
(and so sync_dev right now can return with dirty data still present if
quota is running).

Please Jason could you confirm this my patch fixes the problem for your
reproducible fs corruption? I only given it a try on a floppy and worked
fine here.

The patch can be downloaded also from here

ftp://ftp.*.kernel.org/pub/linux/kernel/people/andrea/patches/v2.2/2.2.14pre7/buffer-races-2.2.14pre7.gz

Thank you.

diff -urN 2.2.14pre7/drivers/acorn/block/fd1772.c 2.2.14pre7-buf-races/drivers/acorn/block/fd1772.c
--- 2.2.14pre7/drivers/acorn/block/fd1772.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/acorn/block/fd1772.c Thu Nov 25 17:45:06 1999
@@ -1536,7 +1536,7 @@
fd_device[drive] = inode->i_rdev;

if (old_dev && old_dev != inode->i_rdev)
- invalidate_buffers(old_dev);
+ invalidate_buffers(old_dev, 0);

/* Allow ioctls if we have write-permissions even if read-only open */
if (filp->f_mode & 2 || permission(inode, 2) == 0)
diff -urN 2.2.14pre7/drivers/acorn/block/mfmhd.c 2.2.14pre7-buf-races/drivers/acorn/block/mfmhd.c
--- 2.2.14pre7/drivers/acorn/block/mfmhd.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/acorn/block/mfmhd.c Thu Nov 25 17:45:13 1999
@@ -1211,7 +1211,7 @@
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
fsync_dev(dev);
- invalidate_buffers(dev);
+ invalidate_buffers(dev, 0);
return 0;

case BLKRASET:
@@ -1508,7 +1508,7 @@
sync_dev (devi);
if (sb)
invalidate_inodes (sb);
- invalidate_buffers (devi);
+ invalidate_buffers (devi, 0);

mfm_gendisk.part[minor].start_sect = 0;
mfm_gendisk.part[minor].nr_sects = 0;
diff -urN 2.2.14pre7/drivers/ap1000/ap.c 2.2.14pre7-buf-races/drivers/ap1000/ap.c
--- 2.2.14pre7/drivers/ap1000/ap.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/ap1000/ap.c Thu Nov 25 17:45:17 1999
@@ -304,7 +304,7 @@
int i;

for (i = 0 ; i < NUM_APDEVS; i++)
- invalidate_buffers(MKDEV(MAJOR_NR, i));
+ invalidate_buffers(MKDEV(MAJOR_NR, i), 0);

unregister_blkdev( MAJOR_NR, "apblock" );
blk_dev[MAJOR_NR].request_fn = 0;
diff -urN 2.2.14pre7/drivers/ap1000/ddv.c 2.2.14pre7-buf-races/drivers/ap1000/ddv.c
--- 2.2.14pre7/drivers/ap1000/ddv.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/ap1000/ddv.c Thu Nov 25 17:45:25 1999
@@ -786,7 +786,7 @@
kdev_t devi = MKDEV(gdev->major, minor);
sync_dev(devi);
invalidate_inodes(devi);
- invalidate_buffers(devi);
+ invalidate_buffers(devi, 0);
gdev->part[minor].start_sect = 0;
gdev->part[minor].nr_sects = 0;
};
@@ -1003,7 +1003,7 @@
struct gendisk ** gdp;

for (i = 0 ; i < NUM_DDVDEVS; i++)
- invalidate_buffers(MKDEV(MAJOR_NR, i));
+ invalidate_buffers(MKDEV(MAJOR_NR, i), 0);

/* reset the opiu */
OPT_IO(OPIU_OP) = OPIU_RESET;
diff -urN 2.2.14pre7/drivers/block/DAC960.c 2.2.14pre7-buf-races/drivers/block/DAC960.c
--- 2.2.14pre7/drivers/block/DAC960.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/DAC960.c Thu Nov 25 17:45:46 1999
@@ -2547,7 +2547,7 @@
/* Flush Buffers. */
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
fsync_dev(Inode->i_rdev);
- invalidate_buffers(Inode->i_rdev);
+ invalidate_buffers(Inode->i_rdev, 0);
return 0;
case BLKRRPART:
/* Re-Read Partition Table. */
@@ -2572,7 +2572,7 @@
sync_dev(Device);
if (SuperBlock != NULL)
invalidate_inodes(SuperBlock);
- invalidate_buffers(Device);
+ invalidate_buffers(Device, 0);
/*
Clear existing partition sizes.
*/
diff -urN 2.2.14pre7/drivers/block/acsi.c 2.2.14pre7-buf-races/drivers/block/acsi.c
--- 2.2.14pre7/drivers/block/acsi.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/acsi.c Thu Nov 25 17:45:51 1999
@@ -1152,7 +1152,7 @@
if(!capable(CAP_SYS_ADMIN)) return -EACCES;
if(!inode->i_rdev) return -EINVAL;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;

case BLKRRPART: /* Re-read partition tables */
@@ -1916,7 +1916,7 @@
fsync_dev(devp);
if (sb)
invalidate_inodes(sb);
- invalidate_buffers(devp);
+ invalidate_buffers(devp, 0);
gdev->part[start + i].nr_sects = 0;
}
gdev->part[start+i].start_sect = 0;
diff -urN 2.2.14pre7/drivers/block/amiflop.c 2.2.14pre7-buf-races/drivers/block/amiflop.c
--- 2.2.14pre7/drivers/block/amiflop.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/amiflop.c Thu Nov 25 17:45:59 1999
@@ -1536,7 +1536,7 @@
sb = get_super(inode->i_rdev);
if (sb)
invalidate_inodes(sb);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
break;
case FDGETPRM:
memset((void *)&getprm, 0, sizeof (getprm));
@@ -1653,7 +1653,7 @@
restore_flags(flags);

if (old_dev && old_dev != inode->i_rdev)
- invalidate_buffers(old_dev);
+ invalidate_buffers(old_dev, 0);

system=(inode->i_rdev & 4)>>2;
unit[drive].dtype=&data_types[system];
@@ -1681,7 +1681,7 @@
sb = get_super(inode->i_rdev);
if (sb)
invalidate_inodes(sb);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
#endif

if (unit[drive].dirty == 1) {
diff -urN 2.2.14pre7/drivers/block/ataflop.c 2.2.14pre7-buf-races/drivers/block/ataflop.c
--- 2.2.14pre7/drivers/block/ataflop.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/ataflop.c Thu Nov 25 17:46:06 1999
@@ -1635,7 +1635,7 @@
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;
}
if (!IOCTL_ALLOWED)
@@ -1948,7 +1948,7 @@
fd_device[drive] = MINOR(inode->i_rdev);

if (old_dev && old_dev != MINOR(inode->i_rdev))
- invalidate_buffers(MKDEV(FLOPPY_MAJOR, old_dev));
+ invalidate_buffers(MKDEV(FLOPPY_MAJOR, old_dev), 0);

/* Allow ioctls if we have write-permissions even if read-only open */
if (filp->f_mode & 2 || permission (inode, 2) == 0)
diff -urN 2.2.14pre7/drivers/block/cpqarray.c 2.2.14pre7-buf-races/drivers/block/cpqarray.c
--- 2.2.14pre7/drivers/block/cpqarray.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/cpqarray.c Thu Nov 25 17:46:10 1999
@@ -1535,7 +1535,7 @@
struct super_block *sb = get_super(devi);
sync_dev(devi);
if (sb) invalidate_inodes(sb);
- invalidate_buffers(devi);
+ invalidate_buffers(devi, 0);
gdev->part[minor].start_sect = 0;
gdev->part[minor].nr_sects = 0;

diff -urN 2.2.14pre7/drivers/block/floppy.c 2.2.14pre7-buf-races/drivers/block/floppy.c
--- 2.2.14pre7/drivers/block/floppy.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/floppy.c Thu Nov 25 17:46:16 1999
@@ -3446,7 +3446,7 @@
case BLKFLSBUF:
if(!capable(CAP_SYS_ADMIN)) return -EACCES;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;

case BLKGETSIZE:
@@ -3777,7 +3777,7 @@
if (old_dev != -1 && old_dev != MINOR(inode->i_rdev)) {
if (buffer_drive == drive)
buffer_track = -1;
- invalidate_buffers(MKDEV(FLOPPY_MAJOR,old_dev));
+ invalidate_buffers(MKDEV(FLOPPY_MAJOR,old_dev), 0);
}

/* Allow ioctls if we have write-permissions even if read-only open */
diff -urN 2.2.14pre7/drivers/block/hd.c 2.2.14pre7-buf-races/drivers/block/hd.c
--- 2.2.14pre7/drivers/block/hd.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/hd.c Thu Nov 25 17:46:22 1999
@@ -620,7 +620,7 @@
case BLKFLSBUF:
if(!capable(CAP_SYS_ADMIN)) return -EACCES;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;

case BLKRRPART: /* Re-read partition tables */
@@ -851,7 +851,7 @@
sync_dev(devi);
if (sb)
invalidate_inodes(sb);
- invalidate_buffers(devi);
+ invalidate_buffers(devi, 0);
gdev->part[minor].start_sect = 0;
gdev->part[minor].nr_sects = 0;
};
diff -urN 2.2.14pre7/drivers/block/ide-disk.c 2.2.14pre7-buf-races/drivers/block/ide-disk.c
--- 2.2.14pre7/drivers/block/ide-disk.c Sun Nov 21 03:31:30 1999
+++ 2.2.14pre7-buf-races/drivers/block/ide-disk.c Thu Nov 25 17:46:30 1999
@@ -446,7 +446,7 @@
static void idedisk_release (struct inode *inode, struct file *filp, ide_drive_t *drive)
{
if (drive->removable && !drive->usage) {
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
if (drive->doorlocking && ide_wait_cmd(drive, WIN_DOORUNLOCK, 0, 0, 0, NULL))
drive->doorlocking = 0;
}
diff -urN 2.2.14pre7/drivers/block/ide-floppy.c 2.2.14pre7-buf-races/drivers/block/ide-floppy.c
--- 2.2.14pre7/drivers/block/ide-floppy.c Sun Nov 21 03:31:30 1999
+++ 2.2.14pre7-buf-races/drivers/block/ide-floppy.c Thu Nov 25 17:46:33 1999
@@ -1358,7 +1358,7 @@
#endif /* IDEFLOPPY_DEBUG_LOG */

if (!drive->usage) {
- invalidate_buffers (inode->i_rdev);
+ invalidate_buffers (inode->i_rdev, 0);
idefloppy_create_prevent_cmd (&pc, 0);
(void) idefloppy_queue_pc_tail (drive, &pc);
}
diff -urN 2.2.14pre7/drivers/block/ide.c 2.2.14pre7-buf-races/drivers/block/ide.c
--- 2.2.14pre7/drivers/block/ide.c Sun Nov 21 03:31:30 1999
+++ 2.2.14pre7-buf-races/drivers/block/ide.c Thu Nov 25 17:46:26 1999
@@ -2192,7 +2192,7 @@
case BLKFLSBUF:
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;

case BLKGETSIZE: /* Return device size */
diff -urN 2.2.14pre7/drivers/block/loop.c 2.2.14pre7-buf-races/drivers/block/loop.c
--- 2.2.14pre7/drivers/block/loop.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/loop.c Thu Nov 25 17:46:37 1999
@@ -493,7 +493,7 @@
memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
memset(lo->lo_name, 0, LO_NAME_SIZE);
loop_sizes[lo->lo_number] = 0;
- invalidate_buffers(dev);
+ invalidate_buffers(dev, 0);
MOD_DEC_USE_COUNT;
return 0;
}
diff -urN 2.2.14pre7/drivers/block/md.c 2.2.14pre7-buf-races/drivers/block/md.c
--- 2.2.14pre7/drivers/block/md.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/md.c Thu Nov 25 17:46:48 1999
@@ -404,7 +404,7 @@
wait_on_buffer(bh);
bforget(bh);
fsync_dev(dev);
- invalidate_buffers(dev);
+ invalidate_buffers(dev, 0);
} else
printk(KERN_ERR "md: getblk failed for device %s\n", kdevname(dev));
}
@@ -456,7 +456,7 @@

for (i=0; i<md_dev[minor].nb_dev; i++) {
fsync_dev(md_dev[minor].devices[i].dev);
- invalidate_buffers(md_dev[minor].devices[i].dev);
+ invalidate_buffers(md_dev[minor].devices[i].dev, 0);
}

/* Resize devices according to the factor. It is used to align
@@ -519,7 +519,7 @@
* The device won't exist anymore -> flush it now
*/
fsync_dev (inode->i_rdev);
- invalidate_buffers (inode->i_rdev);
+ invalidate_buffers (inode->i_rdev, 0);
if (md_dev[minor].sb) {
md_dev[minor].sb->state |= 1 << MD_SB_CLEAN;
md_update_sb(minor);
@@ -655,7 +655,7 @@

case BLKFLSBUF:
fsync_dev (inode->i_rdev);
- invalidate_buffers (inode->i_rdev);
+ invalidate_buffers (inode->i_rdev, 0);
break;

case BLKRASET:
diff -urN 2.2.14pre7/drivers/block/nbd.c 2.2.14pre7-buf-races/drivers/block/nbd.c
--- 2.2.14pre7/drivers/block/nbd.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/nbd.c Thu Nov 25 17:46:52 1999
@@ -434,7 +434,7 @@
if (dev >= MAX_NBD)
return -ENODEV;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
lo = &nbd_dev[dev];
if (lo->refcnt <= 0)
printk(KERN_ALERT "nbd_release: refcount(%d) <= 0\n", lo->refcnt);
diff -urN 2.2.14pre7/drivers/block/paride/pd.c 2.2.14pre7-buf-races/drivers/block/paride/pd.c
--- 2.2.14pre7/drivers/block/paride/pd.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/paride/pd.c Thu Nov 25 17:45:34 1999
@@ -505,7 +505,7 @@
if(!capable(CAP_SYS_ADMIN)) return -EACCES;
if(!(inode->i_rdev)) return -EINVAL;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;
case BLKRRPART:
if (!capable(CAP_SYS_ADMIN))
@@ -538,7 +538,7 @@
sb = get_super(devp);
if (sb) invalidate_inodes(sb);

- invalidate_buffers(devp);
+ invalidate_buffers(devp, 0);
if (PD.removable) pd_doorlock(unit,IDE_DOORUNLOCK);
}

@@ -588,7 +588,7 @@
sb = get_super(devp);
if (sb) invalidate_inodes(sb);

- invalidate_buffers(devp);
+ invalidate_buffers(devp, 0);
pd_hd[minor].start_sect = 0;
pd_hd[minor].nr_sects = 0;
}
diff -urN 2.2.14pre7/drivers/block/paride/pf.c 2.2.14pre7-buf-races/drivers/block/paride/pf.c
--- 2.2.14pre7/drivers/block/paride/pf.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/paride/pf.c Thu Nov 25 17:45:40 1999
@@ -455,7 +455,7 @@
if(!capable(CAP_SYS_ADMIN)) return -EACCES;
if(!(inode->i_rdev)) return -EINVAL;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;
RO_IOCTLS(inode->i_rdev,arg);
default:
@@ -485,7 +485,7 @@
sb = get_super(devp);
if (sb) invalidate_inodes(sb);

- invalidate_buffers(devp);
+ invalidate_buffers(devp, 0);
if (PF.removable) pf_lock(unit,0);
}

diff -urN 2.2.14pre7/drivers/block/ps2esdi.c 2.2.14pre7-buf-races/drivers/block/ps2esdi.c
--- 2.2.14pre7/drivers/block/ps2esdi.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/ps2esdi.c Thu Nov 25 17:46:58 1999
@@ -1162,7 +1162,7 @@
if (!inode->i_rdev)
return -EINVAL;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;

case BLKRRPART:
@@ -1197,7 +1197,7 @@
sync_dev(devp);
if (sb)
invalidate_inodes(sb);
- invalidate_buffers(devp);
+ invalidate_buffers(devp, 0);
ps2esdi_gendisk.part[start + partition].start_sect = 0;
ps2esdi_gendisk.part[start + partition].nr_sects = 0;
}
diff -urN 2.2.14pre7/drivers/block/raid1.c 2.2.14pre7-buf-races/drivers/block/raid1.c
--- 2.2.14pre7/drivers/block/raid1.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/raid1.c Thu Nov 25 17:47:05 1999
@@ -661,7 +661,7 @@
}
bforget(bh);
fsync_dev(dev);
- invalidate_buffers(dev);
+ invalidate_buffers(dev, 0);
bh = NULL;
}
if (buffer)
@@ -670,7 +670,7 @@
dev = bh->b_dev;
bforget(bh);
fsync_dev(dev);
- invalidate_buffers(dev);
+ invalidate_buffers(dev, 0);
}
return rc;
}
diff -urN 2.2.14pre7/drivers/block/raid5.c 2.2.14pre7-buf-races/drivers/block/raid5.c
--- 2.2.14pre7/drivers/block/raid5.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/raid5.c Thu Nov 25 17:47:10 1999
@@ -1352,7 +1352,7 @@
bh[i] = NULL;
}
fsync_dev(dev);
- invalidate_buffers(dev);
+ invalidate_buffers(dev, 0);
}
free_page((unsigned long) tmp.b_data);
return rc;
diff -urN 2.2.14pre7/drivers/block/rd.c 2.2.14pre7-buf-races/drivers/block/rd.c
--- 2.2.14pre7/drivers/block/rd.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/rd.c Thu Nov 25 17:59:53 1999
@@ -191,7 +191,10 @@
switch (cmd) {
case BLKFLSBUF:
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
- invalidate_buffers(inode->i_rdev);
+ /* special: we want to release the ramdisk memory,
+ it's not like with the other blockdevices where
+ this ioctl only flushes away the buffer cache. */
+ invalidate_buffers(inode->i_rdev, 1);
break;

case BLKGETSIZE: /* Return device size */
@@ -347,7 +350,7 @@
int i;

for (i = 0 ; i < NUM_RAMDISKS; i++)
- invalidate_buffers(MKDEV(MAJOR_NR, i));
+ invalidate_buffers(MKDEV(MAJOR_NR, i), 1);

unregister_blkdev( MAJOR_NR, "ramdisk" );
blk_dev[MAJOR_NR].request_fn = 0;
@@ -550,7 +553,7 @@
if (i && (i % devblocks == 0)) {
printk("done disk #%d.\n", i/devblocks);
rotate = 0;
- invalidate_buffers(device);
+ invalidate_buffers(device, 0);
if (infile.f_op->release)
infile.f_op->release(&inode, &infile);
printk("Please insert disk #%d and press ENTER\n", i/devblocks+1);
@@ -573,7 +576,7 @@
kfree(buf);

successful_load:
- invalidate_buffers(device);
+ invalidate_buffers(device, 0);
ROOT_DEV = MKDEV(MAJOR_NR, unit);

done:
diff -urN 2.2.14pre7/drivers/block/xd.c 2.2.14pre7-buf-races/drivers/block/xd.c
--- 2.2.14pre7/drivers/block/xd.c Sun Oct 31 23:31:25 1999
+++ 2.2.14pre7-buf-races/drivers/block/xd.c Thu Nov 25 17:47:48 1999
@@ -349,7 +349,7 @@
case BLKFLSBUF: /* Return devices size */
if(!capable(CAP_SYS_ADMIN)) return -EACCES;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;
case HDIO_SET_DMA:
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
@@ -416,7 +416,7 @@
sync_dev(devp);
if (sb)
invalidate_inodes(sb);
- invalidate_buffers(devp);
+ invalidate_buffers(devp, 0);
xd_gendisk.part[minor].start_sect = 0;
xd_gendisk.part[minor].nr_sects = 0;
};
@@ -1195,7 +1195,7 @@
kdev_t devp = MKDEV(MAJOR_NR, minor);
start = dev << xd_gendisk.minor_shift;
sync_dev(devp);
- invalidate_buffers(devp);
+ invalidate_buffers(devp, 0);
}
}
xd_done();
diff -urN 2.2.14pre7/drivers/cdrom/aztcd.c 2.2.14pre7-buf-races/drivers/cdrom/aztcd.c
--- 2.2.14pre7/drivers/cdrom/aztcd.c Sun Oct 31 23:31:26 1999
+++ 2.2.14pre7-buf-races/drivers/cdrom/aztcd.c Thu Nov 25 17:48:09 1999
@@ -1595,7 +1595,7 @@
if (!--azt_open_count) {
azt_invalidate_buffers();
sync_dev(inode->i_rdev); /*??? isn't it a read only dev?*/
- invalidate_buffers(inode -> i_rdev);
+ invalidate_buffers(inode -> i_rdev, 0);
aztUnlockDoor();
if (azt_auto_eject)
aztSendCmd(ACMD_EJECT);
diff -urN 2.2.14pre7/drivers/cdrom/cdrom.c 2.2.14pre7-buf-races/drivers/cdrom/cdrom.c
--- 2.2.14pre7/drivers/cdrom/cdrom.c Sun Oct 31 23:31:26 1999
+++ 2.2.14pre7-buf-races/drivers/cdrom/cdrom.c Thu Nov 25 17:47:55 1999
@@ -556,7 +556,7 @@
sync_dev(dev);
sb = get_super(dev);
if (sb) invalidate_inodes(sb);
- invalidate_buffers(dev);
+ invalidate_buffers(dev, 0);
if (opened_for_data && (cdi->options & CDO_AUTO_EJECT) &&
CDROM_CAN(CDC_OPEN_TRAY))
cdo->tray_move(cdi, 1);
diff -urN 2.2.14pre7/drivers/cdrom/gscd.c 2.2.14pre7-buf-races/drivers/cdrom/gscd.c
--- 2.2.14pre7/drivers/cdrom/gscd.c Sun Oct 31 23:31:26 1999
+++ 2.2.14pre7-buf-races/drivers/cdrom/gscd.c Thu Nov 25 17:48:15 1999
@@ -406,7 +406,7 @@

gscd_bn = -1;
sync_dev(inode->i_rdev);
- invalidate_buffers(inode -> i_rdev);
+ invalidate_buffers(inode -> i_rdev, 0);

MOD_DEC_USE_COUNT;
return 0;
diff -urN 2.2.14pre7/drivers/cdrom/optcd.c 2.2.14pre7-buf-races/drivers/cdrom/optcd.c
--- 2.2.14pre7/drivers/cdrom/optcd.c Sun Oct 31 23:31:26 1999
+++ 2.2.14pre7-buf-races/drivers/cdrom/optcd.c Thu Nov 25 17:48:37 1999
@@ -1931,7 +1931,7 @@
toc_uptodate = 0;
opt_invalidate_buffers();
sync_dev(ip -> i_rdev);
- invalidate_buffers(ip -> i_rdev);
+ invalidate_buffers(ip -> i_rdev, 0);
status = exec_cmd(COMUNLOCK); /* Unlock door */
if (status < 0) {
DEBUG((DEBUG_VFS, "exec_cmd COMUNLOCK: %02x", -status));
diff -urN 2.2.14pre7/drivers/cdrom/sbpcd.c 2.2.14pre7-buf-races/drivers/cdrom/sbpcd.c
--- 2.2.14pre7/drivers/cdrom/sbpcd.c Sun Oct 31 23:31:26 1999
+++ 2.2.14pre7-buf-races/drivers/cdrom/sbpcd.c Thu Nov 25 17:48:45 1999
@@ -5381,7 +5381,7 @@
if (--D_S[d].open_count<=0)
{
D_S[d].sbp_first_frame=D_S[d].sbp_last_frame=-1;
- invalidate_buffers(cdi->dev);
+ invalidate_buffers(cdi->dev, 0);
if (D_S[d].audio_state!=audio_playing)
if (D_S[d].f_eject) cc_SpinDown();
D_S[d].diskstate_flags &= ~cd_size_bit;
diff -urN 2.2.14pre7/drivers/cdrom/sjcd.c 2.2.14pre7-buf-races/drivers/cdrom/sjcd.c
--- 2.2.14pre7/drivers/cdrom/sjcd.c Sun Oct 31 23:31:26 1999
+++ 2.2.14pre7-buf-races/drivers/cdrom/sjcd.c Thu Nov 25 17:48:55 1999
@@ -1401,7 +1401,7 @@
if( --sjcd_open_count == 0 ){
sjcd_invalidate_buffers();
sync_dev( inode->i_rdev );
- invalidate_buffers( inode->i_rdev );
+ invalidate_buffers( inode->i_rdev, 0 );
s = sjcd_tray_unlock();
if( s < 0 || !sjcd_status_valid || sjcd_command_failed ){
#if defined( SJCD_DIAGNOSTIC )
diff -urN 2.2.14pre7/drivers/scsi/sd.c 2.2.14pre7-buf-races/drivers/scsi/sd.c
--- 2.2.14pre7/drivers/scsi/sd.c Sun Oct 31 23:31:29 1999
+++ 2.2.14pre7-buf-races/drivers/scsi/sd.c Thu Nov 25 17:49:07 1999
@@ -1695,7 +1695,7 @@
struct super_block *sb = get_super(devi);
sync_dev(devi);
if (sb) invalidate_inodes(sb);
- invalidate_buffers(devi);
+ invalidate_buffers(devi, 0);
sd_gendisks->part[index].start_sect = 0;
sd_gendisks->part[index].nr_sects = 0;
/*
@@ -1750,7 +1750,7 @@
struct super_block *sb = get_super(devi);
sync_dev(devi);
if (sb) invalidate_inodes(sb);
- invalidate_buffers(devi);
+ invalidate_buffers(devi, 0);
sd_gendisks->part[index].start_sect = 0;
sd_gendisks->part[index].nr_sects = 0;
sd_sizes[index] = 0;
diff -urN 2.2.14pre7/drivers/scsi/sd_ioctl.c 2.2.14pre7-buf-races/drivers/scsi/sd_ioctl.c
--- 2.2.14pre7/drivers/scsi/sd_ioctl.c Sun Oct 31 23:31:29 1999
+++ 2.2.14pre7-buf-races/drivers/scsi/sd_ioctl.c Thu Nov 25 17:49:00 1999
@@ -100,7 +100,7 @@
if(!capable(CAP_SYS_ADMIN)) return -EACCES;
if(!(inode->i_rdev)) return -EINVAL;
fsync_dev(inode->i_rdev);
- invalidate_buffers(inode->i_rdev);
+ invalidate_buffers(inode->i_rdev, 0);
return 0;

case BLKRRPART: /* Re-read partition tables */
diff -urN 2.2.14pre7/drivers/scsi/sr.c 2.2.14pre7-buf-races/drivers/scsi/sr.c
--- 2.2.14pre7/drivers/scsi/sr.c Sun Oct 31 23:31:30 1999
+++ 2.2.14pre7-buf-races/drivers/scsi/sr.c Thu Nov 25 17:49:12 1999
@@ -1115,7 +1115,7 @@
* We should be kind to our buffer cache, however.
*/
if (sb) invalidate_inodes(sb);
- invalidate_buffers(devi);
+ invalidate_buffers(devi, 0);

/*
* Reset things back to a sane state so that one can re-load a new
diff -urN 2.2.14pre7/drivers/scsi/sr_ioctl.c 2.2.14pre7-buf-races/drivers/scsi/sr_ioctl.c
--- 2.2.14pre7/drivers/scsi/sr_ioctl.c Sun Oct 31 23:31:30 1999
+++ 2.2.14pre7-buf-races/drivers/scsi/sr_ioctl.c Thu Nov 25 17:49:19 1999
@@ -272,7 +272,7 @@

int sr_reset(struct cdrom_device_info *cdi)
{
- invalidate_buffers(cdi->dev);
+ invalidate_buffers(cdi->dev, 0);
return 0;
}

@@ -887,7 +887,7 @@
if(!(cdi->dev))
return -EINVAL;
fsync_dev(cdi->dev);
- invalidate_buffers(cdi->dev);
+ invalidate_buffers(cdi->dev, 0);
return 0;

default:
diff -urN 2.2.14pre7/fs/buffer.c 2.2.14pre7-buf-races/fs/buffer.c
--- 2.2.14pre7/fs/buffer.c Sun Nov 21 03:31:32 1999
+++ 2.2.14pre7-buf-races/fs/buffer.c Thu Nov 25 18:04:22 1999
@@ -24,6 +24,9 @@
* - RMK
*/

+/* invalidate_buffers/set_blocksize/sync_dev race conditions and
+ fs corruption fixes, 1999, Andrea Arcangeli <andrea@suse.de> */
+
#include <linux/malloc.h>
#include <linux/locks.h>
#include <linux/errno.h>
@@ -263,11 +266,14 @@

void sync_dev(kdev_t dev)
{
- sync_buffers(dev, 0);
sync_supers(dev);
sync_inodes(dev);
- sync_buffers(dev, 0);
DQUOT_SYNC(dev);
+ /* sync all the dirty buffers out to disk only _after_ all the
+ high level layers finished generated buffer dirty data
+ (or we'll return with some buffer still dirty on the blockdevice
+ so breaking the semantics of this call) */
+ sync_buffers(dev, 0);
/*
* FIXME(eric) we need to sync the physical devices here.
* This is because some (scsi) controllers have huge amounts of
@@ -396,31 +402,6 @@
return err;
}

-void invalidate_buffers(kdev_t dev)
-{
- int i;
- int nlist;
- struct buffer_head * bh;
-
- for(nlist = 0; nlist < NR_LIST; nlist++) {
- bh = lru_list[nlist];
- for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bh->b_next_free) {
- if (bh->b_dev != dev)
- continue;
- wait_on_buffer(bh);
- if (bh->b_dev != dev)
- continue;
- if (bh->b_count)
- continue;
- bh->b_flushtime = 0;
- clear_bit(BH_Protected, &bh->b_state);
- clear_bit(BH_Uptodate, &bh->b_state);
- clear_bit(BH_Dirty, &bh->b_state);
- clear_bit(BH_Req, &bh->b_state);
- }
- }
-}
-
/* After several hours of tedious analysis, the following hash
* function won. Do not mess with it... -DaveM
*/
@@ -492,11 +473,14 @@
remove_from_lru_list(bh);
}

-static inline void put_last_free(struct buffer_head * bh)
+static void put_last_free(struct buffer_head * bh)
{
if (bh) {
struct buffer_head **bhp = &free_list[BUFSIZE_INDEX(bh->b_size)];

+ bh->b_count = 0;
+ bh->b_state = 0;
+ remove_from_queues(bh);
bh->b_dev = B_FREE; /* So it is obvious we are on the free list. */

/* Add to back of free list. */
@@ -516,7 +500,7 @@
{
/* put at end of free list */
if(bh->b_dev == B_FREE) {
- put_last_free(bh);
+ panic("B_FREE inserted into queues");
} else {
struct buffer_head **bhp = &lru_list[bh->b_list];

@@ -606,10 +590,62 @@
return 0;
}

+/* If invalidate_buffers() will trash dirty buffers, it means some kind
+ of fs corruption is going on. Trashing dirty data always imply losing
+ information that was supposed to be just stored on the physical layer
+ by the user.
+
+ Thus invalidate_buffers in general usage is not allwowed to trash dirty
+ buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
+
+ NOTE: In the case where the user removed a removable-media-disk even if
+ there's still dirty data not synced on disk (due a bug in the device driver
+ or due an error of the user), by not destroying the dirty buffers we could
+ generate corruption also on the next media inserted, thus a parameter is
+ necessary to handle this case in the most safe way possible (trying
+ to not corrupt also the new disk inserted with the data belonging to
+ the old now corrupted disk). Also for the ramdisk the natural thing
+ to do in order to release the ramdisk memory is to destroy dirty buffers.
+
+ These are two special cases. Normal usage imply the device driver
+ to issue a sync on the device (without waiting I/O completation) and
+ then an invalidate_buffers call that doesn't trashes dirty buffers. */
+void invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
+{
+ int i, nlist, slept;
+ struct buffer_head * bh, * bhnext;
+
+ again:
+ slept = 0;
+ for(nlist = 0; nlist < NR_LIST; nlist++) {
+ bh = lru_list[nlist];
+ if (!bh)
+ continue;
+ for (i = nr_buffers_type[nlist] ; i > 0 ;
+ bh = bhnext, i--)
+ {
+ bhnext = bh->b_next_free;
+ if (bh->b_dev != dev)
+ continue;
+ if (!destroy_dirty_buffers && buffer_dirty(bh))
+ continue;
+ if (buffer_locked(bh))
+ {
+ slept = 1;
+ __wait_on_buffer(bh);
+ }
+ if (!bh->b_count)
+ put_last_free(bh);
+ if (slept)
+ goto again;
+ }
+ }
+}
+
void set_blocksize(kdev_t dev, int size)
{
extern int *blksize_size[];
- int i, nlist;
+ int i, nlist, slept;
struct buffer_head * bh, *bhnext;

if (!blksize_size[MAJOR(dev)])
@@ -631,27 +667,35 @@
/* We need to be quite careful how we do this - we are moving entries
* around on the free list, and we can get in a loop if we are not careful.
*/
- for(nlist = 0; nlist < NR_LIST; nlist++) {
+ again:
+ slept = 0;
+ for(nlist = 0; nlist < NR_LIST; nlist++) {
bh = lru_list[nlist];
- for (i = nr_buffers_type[nlist]*2 ; --i > 0 ; bh = bhnext) {
- if(!bh)
- break;
-
- bhnext = bh->b_next_free;
- if (bh->b_dev != dev)
- continue;
- if (bh->b_size == size)
- continue;
- bhnext->b_count++;
- wait_on_buffer(bh);
- bhnext->b_count--;
- if (bh->b_dev == dev && bh->b_size != size) {
- clear_bit(BH_Dirty, &bh->b_state);
- clear_bit(BH_Uptodate, &bh->b_state);
- clear_bit(BH_Req, &bh->b_state);
- bh->b_flushtime = 0;
+ if (!bh)
+ continue;
+ for (i = nr_buffers_type[nlist] ; i > 0 ;
+ bh = bhnext, i--)
+ {
+ bhnext = bh->b_next_free;
+ if (bh->b_dev != dev || bh->b_size == size)
+ continue;
+ if (buffer_dirty(bh))
+ printk(KERN_ERR "set_blocksize: dev %s buffer_dirty %lu size %lu\n", kdevname(dev), bh->b_blocknr, bh->b_size);
+ if (buffer_locked(bh))
+ {
+ slept = 1;
+ wait_on_buffer(bh);
}
- remove_from_hash_queue(bh);
+ if (!bh->b_count)
+ put_last_free(bh);
+ else
+ printk(KERN_ERR
+ "set_blocksize: "
+ "b_count %d, dev %s, block %lu!\n",
+ bh->b_count, bdevname(bh->b_dev),
+ bh->b_blocknr);
+ if (slept)
+ goto again;
}
}
}
@@ -831,9 +875,6 @@
__brelse(buf);
return;
}
- buf->b_count = 0;
- buf->b_state = 0;
- remove_from_queues(buf);
put_last_free(buf);
}

diff -urN 2.2.14pre7/fs/devices.c 2.2.14pre7-buf-races/fs/devices.c
--- 2.2.14pre7/fs/devices.c Sun Oct 31 23:31:33 1999
+++ 2.2.14pre7-buf-races/fs/devices.c Thu Nov 25 17:51:29 1999
@@ -216,7 +216,8 @@
if (sb && invalidate_inodes(sb))
printk("VFS: busy inodes on changed media.\n");

- invalidate_buffers(dev);
+ /* special: trash all dirty data as well as the media is changed */
+ invalidate_buffers(dev, 1);

if (fops->revalidate)
fops->revalidate(dev);
diff -urN 2.2.14pre7/fs/super.c 2.2.14pre7-buf-races/fs/super.c
--- 2.2.14pre7/fs/super.c Sun Oct 31 23:31:33 1999
+++ 2.2.14pre7-buf-races/fs/super.c Thu Nov 25 17:52:42 1999
@@ -1276,7 +1276,10 @@
umount_error = do_umount(old_root_dev,1, 0);
if (!umount_error) {
printk("okay\n");
- invalidate_buffers(old_root_dev);
+ /* special: the old device driver is going to be
+ a ramdisk and the point of this call is to free its
+ protected memory (even if dirty). */
+ invalidate_buffers(old_root_dev, 1);
return 0;
}
printk(KERN_ERR "error %d\n",umount_error);
diff -urN 2.2.14pre7/include/linux/fs.h 2.2.14pre7-buf-races/include/linux/fs.h
--- 2.2.14pre7/include/linux/fs.h Sun Nov 21 03:31:32 1999
+++ 2.2.14pre7-buf-races/include/linux/fs.h Thu Nov 25 17:53:03 1999
@@ -783,7 +783,7 @@
extern int check_disk_change(kdev_t dev);
extern int invalidate_inodes(struct super_block * sb);
extern void invalidate_inode_pages(struct inode *);
-extern void invalidate_buffers(kdev_t dev);
+extern void invalidate_buffers(kdev_t dev, int);
extern int floppy_is_wp(int minor);
extern void sync_inodes(kdev_t dev);
extern void write_inode_now(struct inode *inode);

Andrea

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/