[CFT][PATCH] long-living cache for block devices

From: Alexander Viro (viro@math.psu.edu)
Date: Fri Nov 09 2001 - 19:17:54 EST


        Folks, patch below fixes the problems with flushing block device
cache too early + closes a bunch of races. Right now it assumes that
->check_media_change() works for all block devices; we can either fix
the drivers to make that assumption 100% correct (i.e. if in doubt -
say that it had been changed) or uncomment truncate_inode_pages() in
blkdev_put() and make it conditional on device (i.e. called only if
->check_media_change() is unreliable). Frankly, I would prefer the
former.

        Logics looks so: upon the final close() we finish all pending
IO and destroy all buffer_heads for device. However, we leave pages
alive. block_device stays alive as long as it has pages in cache.
unregister_blkdev() drops the cache for all devices with that major.
->bd_op handling had been sanitized, so the last remnants of rmmod
races for block devices should be gone now.

        Patch works here. Please, help with testing.

diff -urN S15-pre2/fs/block_dev.c S15-pre2-new/fs/block_dev.c
--- S15-pre2/fs/block_dev.c Tue Nov 6 11:36:38 2001
+++ S15-pre2-new/fs/block_dev.c Fri Nov 9 19:01:50 2001
@@ -221,6 +221,18 @@
 static struct vfsmount *bd_mnt;
 
 /*
+ * drivers.
+ */
+
+static struct {
+ const char *name;
+ struct block_device_operations *bdops;
+ struct list_head bdevs;
+} blkdevs[MAX_BLKDEV];
+
+static LIST_HEAD(killable);
+
+/*
  * bdev cache handling - shamelessly stolen from inode.c
  * We use smaller hashtable, though.
  */
@@ -246,6 +258,8 @@
                 memset(bdev, 0, sizeof(*bdev));
                 sema_init(&bdev->bd_sem, 1);
                 INIT_LIST_HEAD(&bdev->bd_inodes);
+ INIT_LIST_HEAD(&bdev->bd_devs);
+ INIT_LIST_HEAD(&bdev->bd_list);
         }
 }
 
@@ -274,6 +288,8 @@
         err = PTR_ERR(bd_mnt);
         if (IS_ERR(bd_mnt))
                 panic("Cannot create bdev pseudo-fs");
+ for (i = 0; i < MAX_BLKDEV; i++)
+ INIT_LIST_HEAD(&blkdevs[i].bdevs);
 }
 
 /*
@@ -296,6 +312,7 @@
                 if (bdev->bd_dev != dev)
                         continue;
                 atomic_inc(&bdev->bd_count);
+ list_del_init(&bdev->bd_list);
                 return bdev;
         }
         return NULL;
@@ -346,19 +363,46 @@
         inode->i_mapping = &inode->i_data;
 }
 
+static void __bdput(struct block_device *bdev)
+{
+ struct list_head *p;
+ while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
+ __bd_forget(list_entry(p, struct inode, i_devices));
+ }
+ list_del_init(&bdev->bd_hash);
+ list_del_init(&bdev->bd_devs);
+ list_del_init(&bdev->bd_list);
+ spin_unlock(&bdev_lock);
+ iput(bdev->bd_inode);
+ destroy_bdev(bdev);
+}
+
+void shrink_bdcache(void)
+{
+ struct block_device *bdev;
+ struct list_head *p;
+retry:
+ spin_lock(&bdev_lock);
+ list_for_each(p, &killable) {
+ bdev = list_entry(p, struct block_device, bd_list);
+ if (bdev->bd_inode->i_mapping->nrpages == 0) {
+ __bdput(bdev);
+ goto retry;
+ }
+ }
+ spin_unlock(&bdev_lock);
+}
+
 void bdput(struct block_device *bdev)
 {
         if (atomic_dec_and_lock(&bdev->bd_count, &bdev_lock)) {
- struct list_head *p;
                 if (bdev->bd_openers)
                         BUG();
- list_del(&bdev->bd_hash);
- while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
- __bd_forget(list_entry(p, struct inode, i_devices));
- }
- spin_unlock(&bdev_lock);
- iput(bdev->bd_inode);
- destroy_bdev(bdev);
+ list_add(&bdev->bd_list, &killable);
+ if (bdev->bd_inode->i_mapping->nrpages == 0)
+ __bdput(bdev);
+ else
+ spin_unlock(&bdev_lock);
         }
 }
  
@@ -368,6 +412,7 @@
         spin_lock(&bdev_lock);
         if (inode->i_bdev) {
                 atomic_inc(&inode->i_bdev->bd_count);
+ list_del_init(&inode->i_bdev->bd_list);
                 spin_unlock(&bdev_lock);
                 return 0;
         }
@@ -396,11 +441,6 @@
         spin_unlock(&bdev_lock);
 }
 
-static struct {
- const char *name;
- struct block_device_operations *bdops;
-} blkdevs[MAX_BLKDEV];
-
 int get_blkdev_list(char * p)
 {
         int i;
@@ -415,62 +455,118 @@
         return len;
 }
 
-/*
- Return the function table of a device.
- Load the driver if needed.
-*/
-const struct block_device_operations * get_blkfops(unsigned int major)
-{
- const struct block_device_operations *ret = NULL;
-
- /* major 0 is used for non-device mounts */
- if (major && major < MAX_BLKDEV) {
-#ifdef CONFIG_KMOD
- if (!blkdevs[major].bdops) {
- char name[20];
- sprintf(name, "block-major-%d", major);
- request_module(name);
- }
-#endif
- ret = blkdevs[major].bdops;
- }
- return ret;
-}
-
 int register_blkdev(unsigned int major, const char * name, struct block_device_operations *bdops)
 {
+ spin_lock(&bdev_lock);
         if (major == 0) {
                 for (major = MAX_BLKDEV-1; major > 0; major--) {
- if (blkdevs[major].bdops == NULL) {
+ if (!blkdevs[major].name) {
                                 blkdevs[major].name = name;
                                 blkdevs[major].bdops = bdops;
+ spin_unlock(&bdev_lock);
                                 return major;
                         }
                 }
+ spin_unlock(&bdev_lock);
                 return -EBUSY;
         }
- if (major >= MAX_BLKDEV)
+ if (major >= MAX_BLKDEV) {
+ spin_unlock(&bdev_lock);
                 return -EINVAL;
- if (blkdevs[major].bdops && blkdevs[major].bdops != bdops)
+ }
+ if (blkdevs[major].name) {
+ spin_unlock(&bdev_lock);
                 return -EBUSY;
+ }
         blkdevs[major].name = name;
         blkdevs[major].bdops = bdops;
+ spin_unlock(&bdev_lock);
         return 0;
 }
 
 int unregister_blkdev(unsigned int major, const char * name)
 {
+ struct list_head *p;
+ struct block_device *bdev;
         if (major >= MAX_BLKDEV)
                 return -EINVAL;
- if (!blkdevs[major].bdops)
- return -EINVAL;
- if (strcmp(blkdevs[major].name, name))
+ spin_lock(&bdev_lock);
+ if (!blkdevs[major].bdops || strcmp(blkdevs[major].name, name)) {
+ spin_unlock(&bdev_lock);
                 return -EINVAL;
- blkdevs[major].name = NULL;
+ }
         blkdevs[major].bdops = NULL;
+ while ( (p = blkdevs[major].bdevs.next) != &blkdevs[major].bdevs ) {
+ bdev = list_entry(p, struct block_device, bd_devs);
+ atomic_inc(&bdev->bd_count);
+ list_del_init(&bdev->bd_list);
+ list_del_init(p);
+ bdev->bd_op = NULL;
+ spin_unlock(&bdev_lock);
+ truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+ bdput(bdev);
+ spin_lock(&bdev_lock);
+ }
+ blkdevs[major].name = NULL;
+ spin_unlock(&bdev_lock);
         return 0;
 }
 
+static struct block_device_operations *get_bdops(int major)
+{
+ struct block_device_operations *ops = NULL;
+ spin_lock(&bdev_lock);
+ if (major < MAX_BLKDEV) {
+ ops = blkdevs[major].bdops;
+ if (ops && ops->owner && !try_inc_mod_count(ops->owner))
+ ops = NULL;
+ }
+ spin_unlock(&bdev_lock);
+ return ops;
+}
+
+static void put_bdops(struct block_device_operations *ops)
+{
+ if (ops->owner)
+ __MOD_DEC_USE_COUNT(ops->owner);
+}
+
+int bd_activate(struct block_device *bdev)
+{
+ struct block_device_operations *ops;
+ int major = MAJOR(to_kdev_t(bdev->bd_dev));
+ spin_lock(&bdev_lock);
+ ops = bdev->bd_op;
+ if (ops) {
+ if (!ops->owner || try_inc_mod_count(ops->owner)) {
+ spin_unlock(&bdev_lock);
+ return 1;
+ }
+ bdev->bd_op = NULL;
+ list_del_init(&bdev->bd_devs);
+ }
+ spin_unlock(&bdev_lock);
+ ops = get_bdops(major);
+#ifdef CONFIG_KMOD
+ if (!ops && major < MAX_BLKDEV) {
+ char name[20];
+ sprintf(name, "block-major-%d", major);
+ request_module(name);
+ ops = get_bdops(major);
+ }
+#endif
+ if (!ops)
+ return 0;
+ spin_lock(&bdev_lock);
+ if (!bdev->bd_op) {
+ bdev->bd_op = ops;
+ list_add(&bdev->bd_devs, &blkdevs[major].bdevs);
+ } else if (bdev->bd_op != ops)
+ BUG();
+ spin_unlock(&bdev_lock);
+ return 1;
+}
+
 /*
  * This routine checks whether a removable media has been changed,
  * and invalidates all buffer-cache-entries in that case. This
@@ -482,25 +578,15 @@
  */
 int check_disk_change(kdev_t dev)
 {
- int i;
- const struct block_device_operations * bdops = NULL;
+ struct block_device_operations * bdops = get_bdops(MAJOR(dev));
 
- i = MAJOR(dev);
- if (i < MAX_BLKDEV)
- bdops = blkdevs[i].bdops;
- if (bdops == NULL) {
- devfs_handle_t de;
-
- de = devfs_find_handle (NULL, NULL, i, MINOR (dev),
- DEVFS_SPECIAL_BLK, 0);
- if (de) bdops = devfs_get_ops (de);
- }
         if (bdops == NULL)
                 return 0;
- if (bdops->check_media_change == NULL)
- return 0;
- if (!bdops->check_media_change(dev))
+
+ if (!bdops->check_media_change || !bdops->check_media_change(dev)) {
+ put_bdops(bdops);
                 return 0;
+ }
 
         printk(KERN_DEBUG "VFS: Disk change detected on device %s\n",
                 bdevname(dev));
@@ -510,6 +596,7 @@
 
         if (bdops->revalidate)
                 bdops->revalidate(dev);
+ put_bdops(bdops);
         return 1;
 }
 
@@ -528,31 +615,25 @@
 
 static int do_open(struct block_device *bdev, struct inode *inode, struct file *file)
 {
- int ret = -ENXIO;
+ int ret;
         kdev_t dev = to_kdev_t(bdev->bd_dev);
 
+ if (!bd_activate(bdev))
+ return -ENXIO;
+
         down(&bdev->bd_sem);
- lock_kernel();
- if (!bdev->bd_op)
- bdev->bd_op = get_blkfops(MAJOR(dev));
- if (bdev->bd_op) {
- ret = 0;
- if (bdev->bd_op->owner)
- __MOD_INC_USE_COUNT(bdev->bd_op->owner);
- if (bdev->bd_op->open)
- ret = bdev->bd_op->open(inode, file);
- if (!ret) {
- bdev->bd_openers++;
- bdev->bd_inode->i_size = blkdev_size(dev);
- bdev->bd_inode->i_blkbits = blksize_bits(block_size(dev));
- } else {
- if (bdev->bd_op->owner)
- __MOD_DEC_USE_COUNT(bdev->bd_op->owner);
- if (!bdev->bd_openers)
- bdev->bd_op = NULL;
- }
+ ret = 0;
+ if (bdev->bd_op->open) {
+ lock_kernel();
+ ret = bdev->bd_op->open(inode, file);
+ unlock_kernel();
         }
- unlock_kernel();
+ if (!ret) {
+ bdev->bd_openers++;
+ bdev->bd_inode->i_size = blkdev_size(dev);
+ bdev->bd_inode->i_blkbits = blksize_bits(block_size(dev));
+ } else
+ put_bdops(bdev->bd_op);
         up(&bdev->bd_sem);
         if (ret)
                 bdput(bdev);
@@ -579,8 +660,6 @@
 
 int blkdev_open(struct inode * inode, struct file * filp)
 {
- struct block_device *bdev;
-
         /*
          * Preserve backwards compatibility and allow large file access
          * even if userspace doesn't ask for it explicitly. Some mkfs
@@ -590,9 +669,7 @@
         filp->f_flags |= O_LARGEFILE;
 
         bd_acquire(inode);
- bdev = inode->i_bdev;
-
- return do_open(bdev, inode, filp);
+ return do_open(inode->i_bdev, inode, filp);
 }
 
 int blkdev_put(struct block_device *bdev, int kind)
@@ -607,14 +684,14 @@
                 __block_fsync(bd_inode);
         else if (kind == BDEV_FS)
                 fsync_no_super(rdev);
- if (!--bdev->bd_openers)
- kill_bdev(bdev);
+ if (!--bdev->bd_openers) {
+ detach_metadata(bdev->bd_inode->i_mapping);
+ /* XXX: The next is needed if ->check... is broken */
+ /* truncate_inode_pages(bdev->bd_inode->i_mapping, 0); */
+ }
         if (bdev->bd_op->release)
                 ret = bdev->bd_op->release(bd_inode, NULL);
- if (bdev->bd_op->owner)
- __MOD_DEC_USE_COUNT(bdev->bd_op->owner);
- if (!bdev->bd_openers)
- bdev->bd_op = NULL;
+ put_bdops(bdev->bd_op);
         unlock_kernel();
         up(&bdev->bd_sem);
         bdput(bdev);
diff -urN S15-pre2/fs/devfs/base.c S15-pre2-new/fs/devfs/base.c
--- S15-pre2/fs/devfs/base.c Tue Nov 6 11:36:39 2001
+++ S15-pre2-new/fs/devfs/base.c Fri Nov 9 19:01:48 2001
@@ -1296,17 +1296,11 @@
         printk ("%s: devfs_register(): NULL name pointer\n", DEVFS_NAME);
         return NULL;
     }
- if (ops == NULL)
+ if (ops == NULL && !S_ISBLK (mode) )
     {
- if ( S_ISBLK (mode) ) ops = (void *) get_blkfops (major);
- if (ops == NULL)
- {
- printk ("%s: devfs_register(%s): NULL ops pointer\n",
- DEVFS_NAME, name);
- return NULL;
- }
- printk ("%s: devfs_register(%s): NULL ops, got %p from major table\n",
- DEVFS_NAME, name, ops);
+ printk ("%s: devfs_register(%s): NULL ops pointer\n",
+ DEVFS_NAME, name);
+ return NULL;
     }
     if ( S_ISDIR (mode) )
     {
@@ -2000,7 +1994,6 @@
 int devfs_register_blkdev (unsigned int major, const char *name,
                            struct block_device_operations *bdops)
 {
- if (boot_options & OPTION_ONLY) return 0;
     return register_blkdev (major, name, bdops);
 } /* End Function devfs_register_blkdev */
 
@@ -2034,7 +2027,6 @@
 
 int devfs_unregister_blkdev (unsigned int major, const char *name)
 {
- if (boot_options & OPTION_ONLY) return 0;
     return unregister_blkdev (major, name);
 } /* End Function devfs_unregister_blkdev */
 
@@ -2173,24 +2165,17 @@
 static int check_disc_changed (struct devfs_entry *de)
 {
     int tmp;
+ int res;
     kdev_t dev = MKDEV (de->u.fcb.u.device.major, de->u.fcb.u.device.minor);
- struct block_device_operations *bdops = de->u.fcb.ops;
     extern int warn_no_part;
 
     if ( !S_ISBLK (de->mode) ) return 0;
- if (bdops == NULL) return 0;
- if (bdops->check_media_change == NULL) return 0;
- if ( !bdops->check_media_change (dev) ) return 0;
- printk ( KERN_DEBUG "VFS: Disk change detected on device %s\n",
- kdevname (dev) );
- if (invalidate_device(dev, 0))
- printk("VFS: busy inodes on changed media..\n");
     /* Ugly hack to disable messages about unable to read partition table */
     tmp = warn_no_part;
     warn_no_part = 0;
- if (bdops->revalidate) bdops->revalidate (dev);
+ res = check_disk_change(dev);
     warn_no_part = tmp;
- return 1;
+ return res;
 } /* End Function check_disc_changed */
 
 
@@ -2366,12 +2351,8 @@
     {
         inode->i_rdev = MKDEV (de->u.fcb.u.device.major,
                                de->u.fcb.u.device.minor);
- if (bd_acquire(inode) == 0)
- {
- if (!inode->i_bdev->bd_op && de->u.fcb.ops)
- inode->i_bdev->bd_op = de->u.fcb.ops;
- }
- else printk ("%s: get_vfs_inode(%d): no block device from bdget()\n",
+ if (bd_acquire(inode) != 0)
+ printk ("%s: get_vfs_inode(%d): no block device from bdget()\n",
                      DEVFS_NAME, (int) inode->i_ino);
     }
     else if ( S_ISFIFO (de->inode.mode) ) inode->i_fop = &def_fifo_fops;
@@ -2478,10 +2459,7 @@
     df = &de->u.fcb;
     file->private_data = de->info;
     if ( S_ISBLK (inode->i_mode) )
- {
         file->f_op = &def_blk_fops;
- if (df->ops) inode->i_bdev->bd_op = df->ops;
- }
     else file->f_op = fops_get ( (struct file_operations*) df->ops );
     if (file->f_op)
         err = file->f_op->open ? (*file->f_op->open) (inode, file) : 0;
diff -urN S15-pre2/fs/super.c S15-pre2-new/fs/super.c
--- S15-pre2/fs/super.c Tue Nov 6 11:36:39 2001
+++ S15-pre2-new/fs/super.c Fri Nov 9 19:01:48 2001
@@ -535,7 +535,6 @@
 {
         struct inode *inode;
         struct block_device *bdev;
- struct block_device_operations *bdops;
         struct super_block * s;
         struct nameidata nd;
         struct list_head *p;
@@ -559,9 +558,6 @@
                 goto out;
         bd_acquire(inode);
         bdev = inode->i_bdev;
- bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
- if (bdops) bdev->bd_op = bdops;
- /* Done with lookups, semaphore down */
         dev = to_kdev_t(bdev->bd_dev);
         if (!(flags & MS_RDONLY))
                 mode |= FMODE_WRITE;
@@ -994,7 +990,6 @@
         bdev = bdget(kdev_t_to_nr(ROOT_DEV));
         if (!bdev)
                 panic(__FUNCTION__ ": unable to allocate root device");
- bdev->bd_op = devfs_get_ops (handle);
         path_start = devfs_generate_path (handle, path + 5, sizeof (path) - 5);
         mode = FMODE_READ;
         if (!(root_mountflags & MS_RDONLY))
diff -urN S15-pre2/include/linux/fs.h S15-pre2-new/include/linux/fs.h
--- S15-pre2/include/linux/fs.h Fri Nov 9 18:58:20 2001
+++ S15-pre2-new/include/linux/fs.h Fri Nov 9 19:01:49 2001
@@ -422,6 +422,8 @@
         const struct block_device_operations *bd_op;
         struct semaphore bd_sem; /* open/close mutex */
         struct list_head bd_inodes;
+ struct list_head bd_devs;
+ struct list_head bd_list;
 };
 
 struct inode {
@@ -1075,7 +1077,7 @@
 extern int blkdev_put(struct block_device *, int);
 
 /* fs/devices.c */
-extern const struct block_device_operations *get_blkfops(unsigned int);
+extern int bd_activate(struct block_device *);
 extern int register_chrdev(unsigned int, const char *, struct file_operations *);
 extern int unregister_chrdev(unsigned int, const char *);
 extern int chrdev_open(struct inode *, struct file *);
diff -urN S15-pre2/mm/filemap.c S15-pre2-new/mm/filemap.c
--- S15-pre2/mm/filemap.c Fri Nov 9 18:58:21 2001
+++ S15-pre2-new/mm/filemap.c Fri Nov 9 19:01:49 2001
@@ -244,6 +244,61 @@
         page_cache_release(page);
 }
 
+static int detach_list_pages(struct list_head *head)
+{
+ struct list_head *curr;
+ struct page * page;
+ int unlocked = 0;
+
+ restart:
+ curr = head->prev;
+ while (curr != head) {
+ unsigned long offset;
+
+ page = list_entry(curr, struct page, list);
+ offset = page->index;
+
+ /* Is one of the pages to truncate? */
+ if (page->buffers) {
+ page_cache_get(page);
+ if (TryLockPage(page)) {
+ /* Restart on this page */
+ list_del(head);
+ list_add(head, curr);
+ spin_unlock(&pagecache_lock);
+ wait_on_page(page);
+ } else {
+ /* Restart after this page */
+ list_del(head);
+ list_add_tail(head, curr);
+ spin_unlock(&pagecache_lock);
+ try_to_free_buffers(page, 0);
+ UnlockPage(page);
+ }
+ page_cache_release(page);
+ spin_lock(&pagecache_lock);
+ unlocked = 1;
+ goto restart;
+ }
+ curr = curr->prev;
+ }
+ return unlocked;
+}
+
+void detach_metadata(struct address_space *mapping)
+{
+ int unlocked;
+
+ spin_lock(&pagecache_lock);
+ do {
+ unlocked = detach_list_pages(&mapping->clean_pages);
+ unlocked |= detach_list_pages(&mapping->dirty_pages);
+ unlocked |= detach_list_pages(&mapping->locked_pages);
+ } while (unlocked);
+ /* Traversed all three lists without dropping the lock */
+ spin_unlock(&pagecache_lock);
+}
+
 static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
 static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
 {
diff -urN S15-pre2/mm/swapfile.c S15-pre2-new/mm/swapfile.c
--- S15-pre2/mm/swapfile.c Tue Nov 6 11:36:41 2001
+++ S15-pre2-new/mm/swapfile.c Fri Nov 9 19:01:48 2001
@@ -916,16 +916,12 @@
 
         if (S_ISBLK(swap_inode->i_mode)) {
                 kdev_t dev = swap_inode->i_rdev;
- struct block_device_operations *bdops;
 
                 p->swap_device = dev;
                 set_blocksize(dev, PAGE_SIZE);
                 
                 bd_acquire(swap_inode);
                 bdev = swap_inode->i_bdev;
- bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
- if (bdops) bdev->bd_op = bdops;
-
                 error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP);
                 if (error)
                         goto bad_swap_2;
diff -urN S15-pre2/mm/vmscan.c S15-pre2-new/mm/vmscan.c
--- S15-pre2/mm/vmscan.c Fri Nov 9 18:58:21 2001
+++ S15-pre2-new/mm/vmscan.c Fri Nov 9 19:01:49 2001
@@ -576,6 +576,7 @@
 
         shrink_dcache_memory(priority, gfp_mask);
         shrink_icache_memory(priority, gfp_mask);
+ shrink_bdcache();
 #ifdef CONFIG_QUOTA
         shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 #endif

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Thu Nov 15 2001 - 21:00:24 EST