Re: balance_dirty_pages broken

From: Andrew Morton (akpm@digeo.com)
Date: Thu Oct 17 2002 - 00:09:41 EST


Doug Ledford wrote:
>
> Actually, I don't know if it's balance_dirty_pages fault or some other
> part of the kernel's fault, but it is broken here. Failure mode is that
> balance_dirty_pages would loop forever. Reason it would loop forever is
> because the ps struct had a negative entry for nr_dirty.

I've seen one other report of this. Something has gone wrong
with the page accounting. Could you please send me your
.config and a description of what filesytems you're using?

err. ramdisk? initrd? If so, does this fix?

 drivers/block/rd.c | 173 +++++++++++++++++++++++------------------------------
 fs/buffer.c | 18 +++--
 kernel/ksyms.c | 1
 3 files changed, 90 insertions(+), 102 deletions(-)

--- 2.5.43/drivers/block/rd.c~rd-cleanup Tue Oct 15 21:10:15 2002
+++ 2.5.43-akpm/drivers/block/rd.c Tue Oct 15 21:12:13 2002
@@ -45,12 +45,14 @@
 #include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
-#include <asm/atomic.h>
+#include <linux/highmem.h>
 #include <linux/bio.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev() */
+#include <linux/backing-dev.h>
+
 #include <asm/uaccess.h>
 
 /*
@@ -73,10 +75,7 @@ unsigned long initrd_start, initrd_end;
 int initrd_below_start_ok;
 #endif
 
-/* Various static variables go here. Most are used only in the RAM disk code.
- */
-
-static unsigned long rd_length[NUM_RAMDISKS]; /* Size of RAM disks in bytes */
+static unsigned long rd_length[NUM_RAMDISKS]; /* Size of RAM disks in bytes */
 static struct gendisk *rd_disks[NUM_RAMDISKS];
 static devfs_handle_t devfs_handle;
 static struct block_device *rd_bdev[NUM_RAMDISKS];/* Protected device data */
@@ -87,7 +86,7 @@ static struct block_device *rd_bdev[NUM_
  * architecture-specific setup routine (from the stored boot sector
  * information).
  */
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE; /* Size of the RAM disks */
+
 /*
  * It would be very desiderable to have a soft-blocksize (that in the case
  * of the ramdisk driver is also the hardblocksize ;) of PAGE_SIZE because
@@ -101,26 +100,17 @@ int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
  */
 int rd_blocksize = BLOCK_SIZE; /* blocksize of the RAM disks */
 
+/* Size of the RAM disks */
+int rd_size = (CONFIG_BLK_DEV_RAM_SIZE + (PAGE_CACHE_SIZE >> 10) - 1) &
+ (PAGE_CACHE_MASK >> 10);
+
 /*
  * Copyright (C) 2000 Linus Torvalds.
  * 2000 Transmeta Corp.
  * aops copied from ramfs.
  */
-static int ramdisk_readpage(struct file *file, struct page * page)
-{
- if (!PageUptodate(page)) {
- void *kaddr = kmap_atomic(page, KM_USER0);
 
- memset(kaddr, 0, PAGE_CACHE_SIZE);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- SetPageUptodate(page);
- }
- unlock_page(page);
- return 0;
-}
-
-static int ramdisk_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+static void wipe_page(struct page *page)
 {
         if (!PageUptodate(page)) {
                 void *kaddr = kmap_atomic(page, KM_USER0);
@@ -130,43 +120,26 @@ static int ramdisk_prepare_write(struct
                 kunmap_atomic(kaddr, KM_USER0);
                 SetPageUptodate(page);
         }
- SetPageDirty(page);
- return 0;
 }
 
-static int ramdisk_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+static int
+rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector, int minor)
 {
- return 0;
-}
-
-static struct address_space_operations ramdisk_aops = {
- readpage: ramdisk_readpage,
- writepage: fail_writepage,
- prepare_write: ramdisk_prepare_write,
- commit_write: ramdisk_commit_write,
-};
-
-static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec,
- sector_t sector, int minor)
-{
- struct address_space * mapping;
+ struct address_space *mapping;
         unsigned long index;
         unsigned int vec_offset;
         int offset, size, err;
 
         err = 0;
         mapping = rd_bdev[minor]->bd_inode->i_mapping;
-
         index = sector >> (PAGE_CACHE_SHIFT - 9);
         offset = (sector << 9) & ~PAGE_CACHE_MASK;
         size = vec->bv_len;
         vec_offset = vec->bv_offset;
 
         do {
+ struct page *page;
                 int count;
- struct page * page;
- char * src, * dst;
- int unlock = 0;
 
                 count = PAGE_CACHE_SIZE - offset;
                 if (count > size)
@@ -176,53 +149,39 @@ static int rd_blkdev_pagecache_IO(int rw
                 page = find_get_page(mapping, index);
                 if (!page) {
                         page = grab_cache_page(mapping, index);
- err = -ENOMEM;
- if (!page)
+ if (!page) {
+ err = -ENOMEM;
                                 goto out;
- err = 0;
-
- if (!PageUptodate(page)) {
- void *kaddr = kmap_atomic(page, KM_USER0);
-
- memset(kaddr, 0, PAGE_CACHE_SIZE);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- SetPageUptodate(page);
                         }
-
- unlock = 1;
+ wipe_page(page);
+ set_page_dirty(page);
+ unlock_page(page);
                 }
+ if (page != vec->bv_page || vec_offset != offset) {
+ if (rw == READ) {
+ char *src = kmap_atomic(page, KM_USER0);
+ char *dst = kmap_atomic(vec->bv_page, KM_USER1);
+
+ memcpy(dst + vec_offset, src + offset, count);
+ flush_dcache_page(vec->bv_page);
+ kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(dst, KM_USER1);
+ } else {
+ char *src = kmap_atomic(vec->bv_page, KM_USER0);
+ char *dst = kmap_atomic(page, KM_USER1);
 
- index++;
-
- if (rw == READ) {
- src = kmap(page);
- src += offset;
- dst = kmap(vec->bv_page) + vec_offset;
- } else {
- dst = kmap(page);
- dst += offset;
- src = kmap(vec->bv_page) + vec_offset;
+ memcpy(dst + offset, src + vec_offset, count);
+ flush_dcache_page(page);
+ kunmap_atomic(src, KM_USER0);
+ kunmap_atomic(dst, KM_USER1);
+ }
                 }
+ page_cache_release(page);
                 offset = 0;
                 vec_offset += count;
-
- memcpy(dst, src, count);
-
- kunmap(page);
- kunmap(vec->bv_page);
-
- if (rw == READ) {
- flush_dcache_page(sbh->b_page);
- } else {
- SetPageDirty(page);
- }
- if (unlock)
- unlock_page(page);
- __free_page(page);
+ index++;
         } while (size);
-
- out:
+out:
         return err;
 }
 
@@ -324,7 +283,6 @@ static ssize_t initrd_read(struct file *
         return count;
 }
 
-
 static int initrd_release(struct inode *inode,struct file *file)
 {
         extern void free_initrd_mem(unsigned long, unsigned long);
@@ -343,14 +301,19 @@ static int initrd_release(struct inode *
         return 0;
 }
 
-
 static struct file_operations initrd_fops = {
- read: initrd_read,
- release: initrd_release,
+ .read = initrd_read,
+ .release = initrd_release,
 };
 
 #endif
 
+struct address_space_operations ramdisk_aops;
+
+static struct backing_dev_info rd_backing_dev_info = {
+ .ra_pages = 0, /* No readahead */
+ .memory_backed = 1, /* Does not contribute to dirty memory */
+};
 
 static int rd_open(struct inode * inode, struct file * filp)
 {
@@ -375,22 +338,25 @@ static int rd_open(struct inode * inode,
          * Immunize device against invalidate_buffers() and prune_icache().
          */
         if (rd_bdev[unit] == NULL) {
- rd_bdev[unit] = bdget(kdev_t_to_nr(inode->i_rdev));
- rd_bdev[unit]->bd_openers++;
- rd_bdev[unit]->bd_block_size = rd_blocksize;
- rd_bdev[unit]->bd_inode->i_mapping->a_ops = &ramdisk_aops;
- rd_bdev[unit]->bd_inode->i_size = rd_length[unit];
- rd_bdev[unit]->bd_queue = &blk_dev[MAJOR_NR].request_queue;
- rd_bdev[unit]->bd_disk = get_disk(rd_disks[unit]);
+ struct block_device *b = bdget(kdev_t_to_nr(inode->i_rdev));
+
+ rd_bdev[unit] = b;
+ b->bd_openers++;
+ b->bd_block_size = rd_blocksize;
+ b->bd_inode->i_mapping->a_ops = &ramdisk_aops;
+ b->bd_inode->i_mapping->backing_dev_info = &rd_backing_dev_info;
+ b->bd_inode->i_size = rd_length[unit];
+ b->bd_queue = &blk_dev[MAJOR_NR].request_queue;
+ b->bd_disk = get_disk(rd_disks[unit]);
         }
 
         return 0;
 }
 
 static struct block_device_operations rd_bd_op = {
- owner: THIS_MODULE,
- open: rd_open,
- ioctl: rd_ioctl,
+ .owner = THIS_MODULE,
+ .open = rd_open,
+ .ioctl = rd_ioctl,
 };
 
 /* Before freeing the module, invalidate all of the protected buffers! */
@@ -416,6 +382,19 @@ static void __exit rd_cleanup (void)
         unregister_blkdev( MAJOR_NR, "ramdisk" );
 }
 
+/*
+ * If someone writes a ramdisk page with submit_bh(), we have a dirty page
+ * with clean buffers. try_to_free_buffers() will then propagate the buffer
+ * cleanness up into page-cleaness and the VM will evict the page.
+ *
+ * To stop that happening, the ramdisk address_space has a ->releasepage()
+ * which always fails.
+ */
+static int fail_releasepage(struct page *page, int offset)
+{
+ return 0;
+}
+
 /* This is the registration and initialization section of the RAM disk driver */
 static int __init rd_init (void)
 {
@@ -428,6 +407,9 @@ static int __init rd_init (void)
                        rd_blocksize);
                 rd_blocksize = BLOCK_SIZE;
         }
+ ramdisk_aops = def_blk_aops;
+ ramdisk_aops.writepage = fail_writepage;
+ ramdisk_aops.releasepage = fail_releasepage;
 
 #ifdef CONFIG_BLK_DEV_INITRD
         initrd_disk = alloc_disk(1);
@@ -519,9 +501,8 @@ __setup("ramdisk_blocksize=", ramdisk_bl
 #endif
 
 /* options - modular */
-MODULE_PARM (rd_size, "1i");
+MODULE_PARM(rd_size, "1i");
 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
 MODULE_PARM (rd_blocksize, "i");
 MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
-
 MODULE_LICENSE("GPL");
--- 2.5.43/fs/buffer.c~rd-cleanup Tue Oct 15 21:10:15 2002
+++ 2.5.43-akpm/fs/buffer.c Tue Oct 15 21:10:15 2002
@@ -388,14 +388,21 @@ __find_get_block_slow(struct block_devic
         head = page_buffers(page);
         bh = head;
         do {
- if (bh->b_blocknr == block) {
+ if (bh->b_blocknr == block && buffer_mapped(bh)) {
                         ret = bh;
                         get_bh(bh);
                         goto out_unlock;
                 }
                 bh = bh->b_this_page;
         } while (bh != head);
- buffer_error();
+ /*
+ * This path can happen if the page had some unmapped buffers, which
+ * will have b_blocknr == -1. When a ramdisk mapping's page was brought
+ * partially uptodate by mkfs and unmap_underlying_metadata searches
+ * for blocks in part of the page which wasn't touched by mkfs.
+ *
+ * buffer_error();
+ */
 out_unlock:
         spin_unlock(&bd_mapping->private_lock);
         page_cache_release(page);
@@ -1463,7 +1470,7 @@ int try_to_release_page(struct page *pag
  * @offset: the index of the truncation point
  *
  * block_invalidatepage() is called when all or part of the page has become
- * invalidatedby a truncate operation.
+ * invalidated by a truncate operation.
  *
  * block_invalidatepage() does not have to release all buffers, but it must
  * ensure that no dirty buffer is left outside @offset and that no I/O
@@ -1617,8 +1624,6 @@ static int __block_write_full_page(struc
         last_block = (inode->i_size - 1) >> inode->i_blkbits;
 
         if (!page_has_buffers(page)) {
- if (S_ISBLK(inode->i_mode))
- buffer_error();
                 if (!PageUptodate(page))
                         buffer_error();
                 create_empty_buffers(page, 1 << inode->i_blkbits,
@@ -2605,7 +2610,8 @@ void free_buffer_head(struct buffer_head
 }
 EXPORT_SYMBOL(free_buffer_head);
 
-static void init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void
+init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
 {
         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
                             SLAB_CTOR_CONSTRUCTOR) {
--- 2.5.43/kernel/ksyms.c~rd-cleanup Tue Oct 15 21:10:15 2002
+++ 2.5.43-akpm/kernel/ksyms.c Tue Oct 15 21:10:15 2002
@@ -138,6 +138,7 @@ EXPORT_SYMBOL(page_address);
 EXPORT_SYMBOL(get_user_pages);
 
 /* filesystem internal functions */
+EXPORT_SYMBOL_GPL(def_blk_aops);
 EXPORT_SYMBOL(def_blk_fops);
 EXPORT_SYMBOL(update_atime);
 EXPORT_SYMBOL(get_fs_type);

.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Wed Oct 23 2002 - 22:00:32 EST