[PATCH] Handle i_size > s_maxbytes correctly

From: Jan Kara
Date: Thu Dec 20 2007 - 12:51:23 EST


Hi Andrew,

nobody seemed to care about this patch, so could you pull it into -mm so
that it gets broader testing? Thanks.

Honza

--
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR
---

Although we don't allow writes over s_maxbytes, it can happen that a file's
size is larger than s_maxbytes. For example we can write the file from a
computer with a different architecture (which has larger s_maxbytes), boot
a kernel with a different set of config options (CONFIG_LBD...), or if two
nodes in a [Ocfs2, and likely Gfs2] cluster have mounted the same file
system and have different s_maxbytes. Thus we have to make sure we don't
crash / corrupt data when seeing such file (page offset of the last page
needn't fit into pgoff_t). Firstly, we make read() and mmap() return error
when user tries to access the file above s_maxbytes, secondly we introduce
a function i_size_read_trunc() which returns min(i_size, s_maxbytes) and
use it when determining maximal page offset we are interested in.

Signed-off-by: Jan Kara <jack@xxxxxxx>
CC: Mark Fasheh <mark.fasheh@xxxxxxxxxx>

diff --git a/fs/buffer.c b/fs/buffer.c
index 7249e01..3861118 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1623,7 +1623,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,

BUG_ON(!PageLocked(page));

- last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
+ last_block = (i_size_read_trunc(inode) - 1) >> inode->i_blkbits;

if (!page_has_buffers(page)) {
create_empty_buffers(page, blocksize,
@@ -2084,7 +2084,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
head = page_buffers(page);

iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+ lblock = (i_size_read_trunc(inode)+blocksize-1) >> inode->i_blkbits;
bh = head;
nr = 0;
i = 0;
@@ -2347,7 +2347,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
int ret = -EINVAL;

lock_page(page);
- size = i_size_read(inode);
+ size = i_size_read_trunc(inode);
if ((page->mapping != inode->i_mapping) ||
(page_offset(page) > size)) {
/* page got truncated out from underneath us */
@@ -2603,7 +2603,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
+ loff_t i_size = i_size_read_trunc(inode);
const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset;
int ret;
@@ -2803,7 +2803,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
- loff_t i_size = i_size_read(inode);
+ loff_t i_size = i_size_read_trunc(inode);
const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
unsigned offset;

diff --git a/fs/direct-io.c b/fs/direct-io.c
index acf0da1..8223868 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -525,8 +525,8 @@ static int get_more_blocks(struct dio *dio)

create = dio->rw & WRITE;
if (dio->lock_type == DIO_LOCKING) {
- if (dio->block_in_file < (i_size_read(dio->inode) >>
- dio->blkbits))
+ if (dio->block_in_file < (i_size_read_trunc(dio->inode)
+ >> dio->blkbits))
create = 0;
} else if (dio->lock_type == DIO_NO_LOCKING) {
create = 0;
@@ -870,7 +870,8 @@ do_holes:
* Be sure to account for a partial block as the
* last block in the file
*/
- i_size_aligned = ALIGN(i_size_read(dio->inode),
+ i_size_aligned =
+ ALIGN(i_size_read_trunc(dio->inode),
1 << blkbits);
if (dio->block_in_file >=
i_size_aligned >> blkbits) {
@@ -961,7 +962,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
dio->next_block_for_io = -1;

dio->iocb = iocb;
- dio->i_size = i_size_read(inode);
+ dio->i_size = i_size_read_trunc(inode);

spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
diff --git a/fs/mpage.c b/fs/mpage.c
index d54f8f8..c666089 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -190,7 +190,8 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,

block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (i_size_read_trunc(inode) + blocksize - 1) >>
+ blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
@@ -468,7 +469,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct block_device *boundary_bdev = NULL;
int length;
struct buffer_head map_bh;
- loff_t i_size = i_size_read(inode);
+ loff_t i_size = i_size_read_trunc(inode);
int ret = 0;

if (page_has_buffers(page)) {
diff --git a/fs/read_write.c b/fs/read_write.c
index ea1f94c..ed91acc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <linux/mount.h>
#include "read_write.h"

#include <asm/uaccess.h>
@@ -263,6 +264,11 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
return -EINVAL;
if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
return -EFAULT;
+ if (unlikely(*pos + count > file->f_vfsmnt->mnt_sb->s_maxbytes)) {
+ if (*pos >= file->f_vfsmnt->mnt_sb->s_maxbytes)
+ return -EOVERFLOW;
+ count = file->f_vfsmnt->mnt_sb->s_maxbytes - *pos;
+ }

ret = rw_verify_area(READ, file, pos, count);
if (ret >= 0) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b3ec4a4..d24ef6c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1394,6 +1394,16 @@ static inline void inode_dec_link_count(struct inode *inode)
mark_inode_dirty(inode);
}

+/* Truncate i_size at s_maxbytes so that pagecache doesn't have problems */
+static inline loff_t i_size_read_trunc(const struct inode *inode)
+{
+ loff_t i_size = i_size_read(inode);
+
+ if (unlikely(inode->i_sb->s_maxbytes < i_size))
+ return inode->i_sb->s_maxbytes;
+ return i_size;
+}
+
extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry);
static inline void file_accessed(struct file *file)
{
diff --git a/mm/filemap.c b/mm/filemap.c
index 188cf5f..eb97b9e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -362,7 +362,7 @@ EXPORT_SYMBOL(sync_page_range_nolock);
*/
int filemap_fdatawait(struct address_space *mapping)
{
- loff_t i_size = i_size_read(mapping->host);
+ loff_t i_size = i_size_read_trunc(mapping->host);

if (i_size == 0)
return 0;
@@ -912,7 +912,7 @@ page_ok:
* another truncate extends the file - this is desired though).
*/

- isize = i_size_read(inode);
+ isize = i_size_read_trunc(inode);
end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(!isize || index > end_index)) {
page_cache_release(page);
@@ -1298,7 +1298,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int did_readaround = 0;
int ret = 0;

- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = (i_size_read_trunc(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;

@@ -1373,7 +1374,7 @@ retry_find:
goto page_not_uptodate;

/* Must recheck i_size under page lock */
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = (i_size_read_trunc(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(vmf->pgoff >= size)) {
unlock_page(page);
page_cache_release(page);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 32132f3..df26ef5 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -63,7 +63,7 @@ do_xip_mapping_read(struct address_space *mapping,
index = *ppos >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;

- isize = i_size_read(inode);
+ isize = i_size_read_trunc(inode);
if (!isize)
goto out;

@@ -219,7 +219,7 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)

/* XXX: are VM_FAULT_ codes OK? */

- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size = (i_size_read_trunc(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
return VM_FAULT_SIGBUS;

diff --git a/mm/mmap.c b/mm/mmap.c
index facc1a7..b2ee331 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -980,6 +980,13 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
if (locks_verify_locked(inode))
return -EAGAIN;

+ /*
+ * Make sure we don't map more than fs is able to handle
+ */
+ if ((((loff_t)pgoff) << PAGE_SHIFT) + len >
+ inode->i_sb->s_maxbytes)
+ return -EINVAL;
+
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
diff --git a/mm/readahead.c b/mm/readahead.c
index c9c50ca..8ec8d4a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -131,7 +131,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
LIST_HEAD(page_pool);
int page_idx;
int ret = 0;
- loff_t isize = i_size_read(inode);
+ loff_t isize = i_size_read_trunc(inode);

if (isize == 0)
goto out;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648..88d3bb1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1082,7 +1082,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
*/
probe_block = 0;
page_no = 0;
- last_block = i_size_read(inode) >> blkbits;
+ last_block = i_size_read_trunc(inode) >> blkbits;
while ((probe_block + blocks_per_page) <= last_block &&
page_no < sis->max) {
unsigned block_in_page;
@@ -1517,7 +1517,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
goto bad_swap;
}

- swapfilesize = i_size_read(inode) >> PAGE_SHIFT;
+ swapfilesize = i_size_read_trunc(inode) >> PAGE_SHIFT;

/*
* Read the swap header.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/