Re: [patch] ramdisk blocksize

Andrea Arcangeli (andrea@suse.de)
Sat, 21 Aug 1999 13:13:58 +0200 (CEST)


On Sat, 21 Aug 1999, Andrea Arcangeli wrote:

>This patch will fix the ramdisk but I found some major bug (that can lead
>to data corruption) in the page/buffer cache (not ramdisk related) and I
>think that once I'll have fixed them the ramdisk won't need any change. So
>take the below patch as a temp (wrong) fix.
>
>--- 2.3.15-pre1-ramdisk/drivers/block/rd.c Fri Aug 20 20:58:03 1999
>+++ 2.3.15-pre1-ramdisk-page-cache/drivers/block/rd.c Fri Aug 20 21:31:32 1999
>@@ -217,9 +217,24 @@
> * If we're writing, we protect the buffer.
> */
>
>- if (CURRENT->cmd == READ)
>- memset(CURRENT->buffer, 0, len);
>- else
>+ if (CURRENT->cmd == READ)
>+ {
>+ struct buffer_head * bh, * oldbh;
>+
>+ bh = CURRENT->bh; /* can the req->bh be null? I think no... */
>+ oldbh = get_hash_table(bh->b_rdev, bh->b_blocknr, bh->b_size);
>+ if (!oldbh)
>+ memset(bh->b_data, 0, len);
>+ else
>+ {
>+ memcpy(bh->b_data, oldbh->b_data, bh->b_size);
>+ if (test_and_clear_bit(BH_Protected, &oldbh->b_state))
>+ set_bit(BH_Protected, &bh->b_state);
>+ if (atomic_set_buffer_clean(oldbh))
>+ set_bit(BH_Protected, &bh->b_state);
>+ atomic_dec(&oldbh->b_count);
>+ }
>+ } else
> set_bit(BH_Protected, &CURRENT->bh->b_state);
>
> end_request(1);

Ok, I just have a preliminary patch that try to fix the potential data
corruption that can happens in 2.3.15-pre1 and previous 2.3.x kernels (and
that will automagically fix the ramdisk driver without changing its
internals).

The corruption bug (that has nothing to do with the ramdisk driver) is the
use of truncate_inode_page() to shrink the icache. If an inode is not
in-use and it's hashed in the icache, it can have dirty or protected pages
allocated in its page cache.

So when we shrink the icache so we need to release also all the page-cache
pages that belongs to such inode, we can't simply mark all the
page-cache-overlapped-buffers as clean in flushpage. Otherwise we'll lose
data-writes and this will lead to data corruption on disk.

Previously (in 2.2.x) it was possible to use truncate_inode_pages without
differences (both for shrink the icache and for truncate(2)/unlink), since
the page cache was only there for reads, and both writes and protected
buffers was placed in the buffer cache. This is not possible anymore since
now the pagecache has dirty or protected data in it.

NOTE: with my patch applyed the blockdevice writes are still not
synchronized with filesystem writes (this avoids us having to hash in the
buffer-hashtable the page-cache-overlapped-buffers). So if you read from
the blockdevice layer you shouldn't expect to read the last uptodate data
and if you write to the blockdevice layer your writes can be lost. So just
choose if to use a blockdevice in raw mode or with a filesystem on the top
of it, before start using it ;).

But with the patch applyed it should be guaranteeed that if you unmount a
ramdisk, and _then_ you read the ramdisk from the blockdevice layer,
you'll read the right data (the page-cache will be correctly converted to
regular buffers and not to orphaned-lost buffers).

Please test the ramdisk driver heavily with only this patch applyed
against 2.3.15-pre1. Try to generate ramdisk images using the ramdisk
itself. Make sure to always unmount before accessing the ramdisk via
/dev/ram*. If you get in troubles give me a way to reproduce please ;).

Thanks.

diff -urN 2.3.15-pre1-ramdisk/fs/buffer.c 2.3.15-pre1-ramdisk-page-cache1/fs/buffer.c
--- 2.3.15-pre1-ramdisk/fs/buffer.c Thu Aug 12 02:53:23 1999
+++ 2.3.15-pre1-ramdisk-page-cache1/fs/buffer.c Sat Aug 21 12:56:19 1999
@@ -1232,6 +1232,34 @@
return 0;
}

+static void hash_orphaned_buffers(struct page * page)
+{
+ struct buffer_head * bh = page->buffers, * first = bh;
+
+ do {
+ struct buffer_head ** head;
+ if (buffer_mapped(bh))
+ {
+ head = &hash(bh->b_dev, bh->b_blocknr);
+ write_lock(&hash_table_lock);
+ __hash_link(bh, head);
+ write_unlock(&hash_table_lock);
+ }
+ bh = bh->b_this_page;
+ } while (bh != first);
+}
+
+static void trash_buffer(struct buffer_head * bh)
+{
+ wait_on_buffer(bh);
+ mark_buffer_clean(bh);
+ clear_bit(BH_Uptodate, &bh->b_state);
+ clear_bit(BH_Mapped, &bh->b_state);
+ clear_bit(BH_Req, &bh->b_state);
+ clear_bit(BH_Protected, &bh->b_state);
+ bh->b_blocknr = -1;
+}
+
/*
* We don't have to release all buffers here, but
* we have to be sure that no dirty buffer is left
@@ -1239,7 +1267,7 @@
* we have truncated the file and are going to free the
* blocks on-disk..
*/
-int block_flushpage(struct inode *inode, struct page *page, unsigned long offset)
+int block_flushpage(struct inode *inode, struct page *page, unsigned long offset, int forget)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
@@ -1260,16 +1288,14 @@
*/
if (offset <= curr_off) {
if (buffer_mapped(bh)) {
- atomic_inc(&bh->b_count);
- wait_on_buffer(bh);
if (bh->b_dev == B_FREE)
BUG();
- mark_buffer_clean(bh);
- clear_bit(BH_Uptodate, &bh->b_state);
- clear_bit(BH_Mapped, &bh->b_state);
- clear_bit(BH_Req, &bh->b_state);
- bh->b_blocknr = 0;
- atomic_dec(&bh->b_count);
+ if (forget)
+ /* the page can't be freed while we are
+ working on it since we hold the
+ page-lock and so shrink_mmap
+ won't race with us. */
+ trash_buffer(bh);
}
}
curr_off = next_off;
@@ -1288,7 +1314,13 @@
*/
if (!offset) {
if (!try_to_free_buffers(page))
+ {
atomic_add(PAGE_CACHE_SIZE, &buffermem);
+ if (!forget)
+ /* this must become a regular buffer so
+ we'll find it then. */
+ hash_orphaned_buffers(page);
+ }
}

return 0;
@@ -1455,11 +1487,33 @@
if (buffer_new(bh)) {
memset(bh->b_data, 0, bh->b_size);
} else {
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- err = -EIO;
- if (!buffer_uptodate(bh))
- goto out;
+ struct buffer_head * bh_old;
+
+ bh_old = get_hash_table(bh->b_dev,
+ bh->b_blocknr,
+ bh->b_size);
+ if (!bh_old)
+ {
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ err = -EIO;
+ if (!buffer_uptodate(bh))
+ goto out;
+ }
+ else
+ {
+ memcpy(bh->b_data, bh_old->b_data,
+ bh->b_size);
+ /* move the protected bit from
+ the old buffer to the new buffer. */
+ if (atomic_expose_buffer(bh_old))
+ atomic_protect_buffer(bh);
+
+ /* now trash the old regular buffer */
+ trash_buffer(bh_old);
+ __remove_from_queues(bh_old);
+ atomic_dec(&bh_old->b_count);
+ }
}
}

@@ -1617,11 +1671,33 @@
if (buffer_new(bh)) {
memset(bh->b_data, 0, bh->b_size);
} else {
- ll_rw_block(READ, 1, &bh);
- wait_on_buffer(bh);
- err = -EIO;
- if (!buffer_uptodate(bh))
- goto out;
+ struct buffer_head * bh_old;
+
+ bh_old = get_hash_table(bh->b_dev,
+ bh->b_blocknr,
+ bh->b_size);
+ if (!bh_old)
+ {
+ ll_rw_block(READ, 1, &bh);
+ wait_on_buffer(bh);
+ err = -EIO;
+ if (!buffer_uptodate(bh))
+ goto out;
+ }
+ else
+ {
+ memcpy(bh->b_data, bh_old->b_data,
+ bh->b_size);
+ /* move the protected bit from
+ the old buffer to the new buffer. */
+ if (atomic_expose_buffer(bh_old))
+ atomic_protect_buffer(bh);
+
+ /* now trash the old regular buffer */
+ trash_buffer(bh_old);
+ __remove_from_queues(bh_old);
+ atomic_dec(&bh_old->b_count);
+ }
}
}

@@ -2036,7 +2112,30 @@
memset(bh->b_data, 0, blocksize);
set_bit(BH_Uptodate, &bh->b_state);
continue;
- }
+ } else {
+ struct buffer_head * bh_old;
+
+ bh_old = get_hash_table(bh->b_dev,
+ bh->b_blocknr,
+ bh->b_size);
+ if (bh_old)
+ {
+ memcpy(bh->b_data, bh_old->b_data,
+ bh->b_size);
+ /* move the protected bit from
+ the old buffer to the new buffer. */
+ if (atomic_expose_buffer(bh_old))
+ atomic_protect_buffer(bh);
+ if (atomic_set_buffer_clean(bh_old))
+ mark_buffer_dirty(bh, 0);
+
+ /* now trash the old regular buffer */
+ trash_buffer(bh_old);
+ __remove_from_queues(bh_old);
+ atomic_dec(&bh_old->b_count);
+ continue;
+ }
+ }
}

init_buffer(bh, end_buffer_io_async, NULL);
diff -urN 2.3.15-pre1-ramdisk/fs/inode.c 2.3.15-pre1-ramdisk-page-cache1/fs/inode.c
--- 2.3.15-pre1-ramdisk/fs/inode.c Tue Jul 13 02:02:40 1999
+++ 2.3.15-pre1-ramdisk-page-cache1/fs/inode.c Fri Aug 20 22:05:46 1999
@@ -263,7 +263,7 @@
break;
inode = list_entry(tmp, struct inode, i_list);
if (inode->i_nrpages)
- truncate_inode_pages(inode, 0);
+ truncate_inode_pages(inode, 0, 0);
clear_inode(inode);
count++;
}
@@ -739,7 +739,7 @@
void (*delete)(struct inode *) = op->delete_inode;
spin_unlock(&inode_lock);
if (inode->i_nrpages)
- truncate_inode_pages(inode, 0);
+ truncate_inode_pages(inode, 0, 1);
delete(inode);
spin_lock(&inode_lock);
}
diff -urN 2.3.15-pre1-ramdisk/include/linux/fs.h 2.3.15-pre1-ramdisk-page-cache1/include/linux/fs.h
--- 2.3.15-pre1-ramdisk/include/linux/fs.h Fri Aug 20 21:00:08 1999
+++ 2.3.15-pre1-ramdisk-page-cache1/include/linux/fs.h Fri Aug 20 22:30:02 1999
@@ -627,7 +627,7 @@

int (*readpage) (struct file *, struct page *);
int (*writepage) (struct file *, struct page *);
- int (*flushpage) (struct inode *, struct page *, unsigned long);
+ int (*flushpage) (struct inode *, struct page *, unsigned long, int);

void (*truncate) (struct inode *);
int (*permission) (struct inode *, int);
@@ -780,6 +780,9 @@
clear_bit(BH_Uptodate, &bh->b_state);
}

+#define atomic_expose_buffer(bh) test_and_clear_bit(BH_Protected, &(bh)->b_state)
+#define atomic_protect_buffer(bh) test_and_set_bit(BH_Protected, &(bh)->b_state)
+
#define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state)

extern inline void __mark_buffer_clean(struct buffer_head *bh)
@@ -899,7 +902,7 @@
extern int block_write_full_page (struct file *, struct page *);
extern int block_write_partial_page (struct file *, struct page *, unsigned long, unsigned long, const char *);
extern int block_write_cont_page (struct file *, struct page *, unsigned long, unsigned long, const char *);
-extern int block_flushpage(struct inode *, struct page *, unsigned long);
+extern int block_flushpage(struct inode *, struct page *, unsigned long, int);

extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
diff -urN 2.3.15-pre1-ramdisk/include/linux/mm.h 2.3.15-pre1-ramdisk-page-cache1/include/linux/mm.h
--- 2.3.15-pre1-ramdisk/include/linux/mm.h Fri Aug 20 21:00:08 1999
+++ 2.3.15-pre1-ramdisk-page-cache1/include/linux/mm.h Sat Aug 21 12:31:27 1999
@@ -344,7 +344,7 @@
extern void remove_inode_page(struct page *);
extern unsigned long page_unuse(struct page *);
extern int shrink_mmap(int, int);
-extern void truncate_inode_pages(struct inode *, unsigned long);
+extern void truncate_inode_pages(struct inode *, unsigned long, int);
extern unsigned long get_cached_page(struct inode *, unsigned long, int);
extern void put_cached_page(unsigned long);

diff -urN 2.3.15-pre1-ramdisk/mm/filemap.c 2.3.15-pre1-ramdisk-page-cache1/mm/filemap.c
--- 2.3.15-pre1-ramdisk/mm/filemap.c Fri Aug 20 17:43:24 1999
+++ 2.3.15-pre1-ramdisk-page-cache1/mm/filemap.c Fri Aug 20 22:07:01 1999
@@ -132,7 +132,8 @@
* Truncate the page cache at a set offset, removing the pages
* that are beyond that offset (and zeroing out partial pages).
*/
-void truncate_inode_pages(struct inode * inode, unsigned long start)
+void truncate_inode_pages(struct inode * inode, unsigned long start,
+ int forget)
{
struct page ** p;
struct page * page;
@@ -152,7 +153,7 @@
lock_page(page);

if (inode->i_op->flushpage)
- inode->i_op->flushpage(inode, page, 0);
+ inode->i_op->flushpage(inode, page, 0, forget);

/*
* We remove the page from the page cache
@@ -199,7 +200,7 @@
flush_page_to_ram(address);

if (inode->i_op->flushpage)
- inode->i_op->flushpage(inode, page, offset);
+ inode->i_op->flushpage(inode, page, offset, forget);
/*
* we have dropped the spinlock so we have to
* restart.
diff -urN 2.3.15-pre1-ramdisk/mm/memory.c 2.3.15-pre1-ramdisk-page-cache1/mm/memory.c
--- 2.3.15-pre1-ramdisk/mm/memory.c Fri Aug 20 17:43:24 1999
+++ 2.3.15-pre1-ramdisk-page-cache1/mm/memory.c Sat Aug 21 12:31:59 1999
@@ -890,7 +890,7 @@
{
struct vm_area_struct * mpnt;

- truncate_inode_pages(inode, offset);
+ truncate_inode_pages(inode, offset, 1);
spin_lock(&inode->i_shared_lock);
if (!inode->i_mmap)
goto out_unlock;
diff -urN 2.3.15-pre1-ramdisk/mm/swap_state.c 2.3.15-pre1-ramdisk-page-cache1/mm/swap_state.c
--- 2.3.15-pre1-ramdisk/mm/swap_state.c Thu Aug 12 02:53:25 1999
+++ 2.3.15-pre1-ramdisk-page-cache1/mm/swap_state.c Fri Aug 20 22:07:21 1999
@@ -215,7 +215,7 @@
#endif
PageClearSwapCache(page);
if (inode->i_op->flushpage)
- inode->i_op->flushpage(inode, page, 0);
+ inode->i_op->flushpage(inode, page, 0, 1);
remove_inode_page(page);
}

Andrea

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/