[PATCH] fs: relax count limitation in rw_verify_area, TAKE 2

From: Edward Shishkin
Date: Fri Dec 10 2010 - 15:37:26 EST


Relax the artificial limitation imposed by rw_verify_area().

We now limit @count to something that fits in ssize_t instead of
int, so that the kernel permits single reads and writes of just
under 2^63 bytes on 64-bit systems (whereas the limit was previously
just under 2^31 bytes), because:

1. This conforms better to the man pages, where @count
is of type size_t (but should not exceed SSIZE_MAX for
predictable results).

2. The old limitation restricts the size of atomic writes
that a local file system can perform: 2G may not be
enough in the near future.

3. Some of our users' applications don't work with the
old limitation (and it is really hard to fix them).

The following subsystems were tested with this patch applied:

direct-io,
ntfs,
squashfs,
cifs,
ecryptfs,
ext[2,3,4],
hfs,
hfsplus,
reiserfs,
xfs,
jfs,
nfs,
gfs2,
btrfs,
isofs

Everything works fine. While testing the new relaxed
limitation, truncation bugs were found and fixed in
direct-io and ecryptfs. The fixes are already
upstream.

Signed-off-by: Edward Shishkin <edward.shishkin@xxxxxxxxx>
---
fs/read_write.c | 16 ++++++----------
fs/splice.c | 4 ++--
include/linux/fs.h | 4 ++--
3 files changed, 10 insertions(+), 14 deletions(-)

--- linux-2.6.37-rc4.orig/fs/read_write.c
+++ linux-2.6.37-rc4/fs/read_write.c
@@ -236,21 +236,19 @@ bad:
}
#endif

-
/*
- * rw_verify_area doesn't like huge counts. We limit
- * them to something that fits in "int" so that others
- * won't have to do range checks all the time.
+ * We limit huge counts to something that fits in "ssize_t"
*/
-int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
+ssize_t rw_verify_area(int read_write, struct file *file, loff_t *ppos,
+ size_t count)
{
struct inode *inode;
loff_t pos;
int retval = -EINVAL;

inode = file->f_path.dentry->d_inode;
- if (unlikely((ssize_t) count < 0))
- return retval;
+ if (unlikely(count > MAX_RW_COUNT))
+ count = MAX_RW_COUNT;
pos = *ppos;
if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
retval = __negative_fpos_check(file, pos, count);
@@ -267,9 +265,7 @@ int rw_verify_area(int read_write, struc
}
retval = security_file_permission(file,
read_write == READ ? MAY_READ : MAY_WRITE);
- if (retval)
- return retval;
- return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
+ return retval ? retval : count;
}

static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
--- linux-2.6.37-rc4.orig/fs/splice.c
+++ linux-2.6.37-rc4/fs/splice.c
@@ -1097,7 +1097,7 @@ static long do_splice_from(struct pipe_i
{
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
loff_t *, size_t, unsigned int);
- int ret;
+ ssize_t ret;

if (unlikely(!(out->f_mode & FMODE_WRITE)))
return -EBADF;
@@ -1126,7 +1126,7 @@ static long do_splice_to(struct file *in
{
ssize_t (*splice_read)(struct file *, loff_t *,
struct pipe_inode_info *, size_t, unsigned int);
- int ret;
+ ssize_t ret;

if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
--- linux-2.6.37-rc4.orig/include/linux/fs.h
+++ linux-2.6.37-rc4/include/linux/fs.h
@@ -1866,8 +1866,8 @@ extern int current_umask(void);
/* /sys/fs */
extern struct kobject *fs_kobj;

-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
-extern int rw_verify_area(int, struct file *, loff_t *, size_t);
+#define MAX_RW_COUNT ((~(size_t)0) >> 1 & PAGE_CACHE_MASK)
+extern ssize_t rw_verify_area(int, struct file *, loff_t *, size_t);

#define FLOCK_VERIFY_READ 1
#define FLOCK_VERIFY_WRITE 2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/