Re: Update of file offset on write() etc. is non-atomic with I/O

From: Linus Torvalds
Date: Mon Mar 03 2014 - 12:37:13 EST


Ok, sorry for the long delay, I was distracted (and hoping that Al
would come up with a patch).

Anyway, attached is the patch I think we should do for this issue. It
is fairly simple:

- it adds a "f_pos_mutex" to the "struct file".

- it adds a new FMODE_ATOMIC_POS flag to the file mode flags to mark
things that need atomic f_pos updates

- it makes the "struct fd" flags be two flags rather than one: the
second flag is for "unlock f_pos_mutex when done"

- it introduces "fd[get,put]_pos()" which gets the f_pos_mutex when required

- it makes read/write/lseek use that.

It's pretty damn straightforward, I think, and is minimally serializing.

Al, comments? Yongzhi Pan, this is pretty much untested, but it's
pretty simple and it does fix your test-case.

Linus

On Thu, Feb 20, 2014 at 9:14 AM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
> Yes, I do think we violate POSIX here because of how we handle f_pos
> (the earlier thread from 2006 you point to talks about the "thread
> safe" part, the point here about the actual wording of "atomic" is a
> separate issue).
>
> Long long ago we used to just pass in the pointer to f_pos directly,
> and then the low-level write would update it all under the inode
> semaphore, and all was good.
>
> And then we ended up having tons of problems with non-regular files
> and drivers accessing f_pos and having nasty races with it because
> they did *not* have any locking (and very fundamentally didn't want
> any), and we broke the serialization of f_pos. We still do the *IO*
> atomically, but yes, the f_pos access itself is outside the lock.
>
> Ho humm.. Al, any ideas of how to fix this?
fs/file_table.c | 1 +
fs/open.c | 4 ++++
fs/read_write.c | 46 ++++++++++++++++++++++++++++++++++++----------
include/linux/file.h | 6 +++---
include/linux/fs.h | 6 +++++-
5 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 5fff9030be34..5b24008ea4f6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -135,6 +135,7 @@ struct file *get_empty_filp(void)
atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
+ mutex_init(&f->f_pos_lock);
eventpoll_init_file(f);
/* f->f_version: 0 */
return f;
diff --git a/fs/open.c b/fs/open.c
index 4b3e1edf2fe4..b9ed8b25c108 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f,
return 0;
}

+ /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
+ if (S_ISREG(inode->i_mode))
+ f->f_mode |= FMODE_ATOMIC_POS;
+
f->f_op = fops_get(inode->i_fop);
if (unlikely(WARN_ON(!f->f_op))) {
error = -ENODEV;
diff --git a/fs/read_write.c b/fs/read_write.c
index edc5746a902a..9273ec92f07e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
}
EXPORT_SYMBOL(vfs_llseek);

+/*
+ * We only lock f_pos if we have threads (in which case "need_put"
+ * will be set) or if the file might be shared with another process
+ * (in which case the file count is elevated).
+ */
+static struct fd fdget_pos(int fd)
+{
+ struct fd f = fdget(fd);
+ struct file *file = f.file;
+
+ if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+ if (f.need_put || file_count(file) > 1) {
+ f.need_pos_unlock = 1;
+ mutex_lock(&file->f_pos_lock);
+ }
+ }
+ return f;
+}
+
+static void fdput_pos(struct fd f)
+{
+ if (f.need_pos_unlock)
+ mutex_unlock(&f.file->f_pos_lock);
+ fdput(f);
+}
+
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
off_t retval;
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
if (!f.file)
return -EBADF;

@@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
- fdput(f);
+ fdput_pos(f);
return retval;
}

@@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
@@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ret = vfs_read(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
return ret;
}
@@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
@@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ret = vfs_write(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}

return ret;
@@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev);
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
@@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}

if (ret > 0)
@@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;

if (f.file) {
@@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}

if (ret > 0)
diff --git a/include/linux/file.h b/include/linux/file.h
index cbacf4faf447..636634a09c92 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -28,7 +28,7 @@ static inline void fput_light(struct file *file, int fput_needed)

struct fd {
struct file *file;
- int need_put;
+ unsigned need_put:1, need_pos_unlock:1;
};

static inline void fdput(struct fd fd)
@@ -44,7 +44,7 @@ static inline struct fd fdget(unsigned int fd)
{
int b;
struct file *f = fget_light(fd, &b);
- return (struct fd){f,b};
+ return (struct fd){f,b,0};
}

extern struct file *fget_raw(unsigned int fd);
@@ -54,7 +54,7 @@ static inline struct fd fdget_raw(unsigned int fd)
{
int b;
struct file *f = fget_raw_light(fd, &b);
- return (struct fd){f,b};
+ return (struct fd){f,b,0};
}

extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60829565e552..ebfde04bca06 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH ((__force fmode_t)0x4000)

+/* File needs atomic accesses to f_pos */
+#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)

@@ -780,13 +783,14 @@ struct file {
const struct file_operations *f_op;

/*
- * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR.
+ * Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode;
+ struct mutex f_pos_lock;
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;