[PATCH driver-core-next] kernfs: cache atomic_write_len in kernfs_open_file

From: Tejun Heo
Date: Tue Mar 04 2014 - 15:39:07 EST


While implementing atomic_write_len, 4d3773c4bb41 ("kernfs: implement
kernfs_ops->atomic_write_len") moved the data copy from userland inside
the section protected by kernfs_get_active() and
kernfs_open_file->mutex so that kernfs_ops->atomic_write_len can be
accessed before copying the buffer from userland.  Unfortunately, this
can lead to a locking order inversion involving mmap_sem if
copy_from_user() takes a page fault: the write path then takes mmap_sem
while holding of->mutex, whereas kernfs_fop_mmap() takes of->mutex with
mmap_sem already held.  Lockdep reports it as follows.

======================================================
[ INFO: possible circular locking dependency detected ]
3.14.0-rc4-next-20140228-sasha-00011-g4077c67-dirty #26 Tainted: G W
-------------------------------------------------------
trinity-c236/10658 is trying to acquire lock:
(&of->mutex#2){+.+.+.}, at: [<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120

but task is already holding lock:
(&mm->mmap_sem){++++++}, at: [<mm/util.c:397>] vm_mmap_pgoff+0x6e/0xe0

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (&mm->mmap_sem){++++++}:
[<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
[<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
[<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
[<mm/memory.c:4188>] might_fault+0x7e/0xb0
[<arch/x86/include/asm/uaccess.h:713 fs/kernfs/file.c:291>] kernfs_fop_write+0xd8/0x190
[<fs/read_write.c:473>] vfs_write+0xe3/0x1d0
[<fs/read_write.c:523 fs/read_write.c:515>] SyS_write+0x5d/0xa0
[<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2

-> #0 (&of->mutex#2){+.+.+.}:
[<kernel/locking/lockdep.c:1840>] check_prev_add+0x13f/0x560
[<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
[<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
[<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
[<kernel/locking/mutex.c:470 kernel/locking/mutex.c:571>] mutex_lock_nested+0x6a/0x510
[<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120
[<mm/mmap.c:1573>] mmap_region+0x310/0x5c0
[<mm/mmap.c:1365>] do_mmap_pgoff+0x385/0x430
[<mm/util.c:399>] vm_mmap_pgoff+0x8f/0xe0
[<mm/mmap.c:1416 mm/mmap.c:1374>] SyS_mmap_pgoff+0x1b0/0x210
[<arch/x86/kernel/sys_x86_64.c:72>] SyS_mmap+0x1d/0x20
[<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2

other info that might help us debug this:

Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&mm->mmap_sem);
                               lock(&of->mutex#2);
                               lock(&mm->mmap_sem);
  lock(&of->mutex#2);

*** DEADLOCK ***

1 lock held by trinity-c236/10658:
#0: (&mm->mmap_sem){++++++}, at: [<mm/util.c:397>] vm_mmap_pgoff+0x6e/0xe0

stack backtrace:
CPU: 2 PID: 10658 Comm: trinity-c236 Tainted: G W 3.14.0-rc4-next-20140228-sasha-00011-g4077c67-dirty #26
0000000000000000 ffff88011911fa48 ffffffff8438e945 0000000000000000
0000000000000000 ffff88011911fa98 ffffffff811a0109 ffff88011911fab8
ffff88011911fab8 ffff88011911fa98 ffff880119128cc0 ffff880119128cf8
Call Trace:
[<lib/dump_stack.c:52>] dump_stack+0x52/0x7f
[<kernel/locking/lockdep.c:1213>] print_circular_bug+0x129/0x160
[<kernel/locking/lockdep.c:1840>] check_prev_add+0x13f/0x560
[<include/linux/spinlock.h:343 mm/slub.c:1933>] ? deactivate_slab+0x511/0x550
[<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
[<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
[<mm/mmap.c:1552>] ? mmap_region+0x24a/0x5c0
[<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
[<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
[<kernel/locking/mutex.c:470 kernel/locking/mutex.c:571>] mutex_lock_nested+0x6a/0x510
[<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
[<kernel/sched/core.c:2477>] ? get_parent_ip+0x11/0x50
[<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
[<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120
[<mm/mmap.c:1573>] mmap_region+0x310/0x5c0
[<mm/mmap.c:1365>] do_mmap_pgoff+0x385/0x430
[<mm/util.c:397>] ? vm_mmap_pgoff+0x6e/0xe0
[<mm/util.c:399>] vm_mmap_pgoff+0x8f/0xe0
[<kernel/rcu/update.c:97>] ? __rcu_read_unlock+0x44/0xb0
[<fs/file.c:641>] ? dup_fd+0x3c0/0x3c0
[<mm/mmap.c:1416 mm/mmap.c:1374>] SyS_mmap_pgoff+0x1b0/0x210
[<arch/x86/kernel/sys_x86_64.c:72>] SyS_mmap+0x1d/0x20
[<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2

Fix it by caching atomic_write_len in kernfs_open_file during open so
that kernfs_fop_write() can determine the write length without
accessing kernfs_ops.  The buffer is then allocated and copied from
userland before of->mutex and the active reference are taken, which
restores the structure kernfs_fop_write() had before 4d3773c4bb41,
with updated @len determination logic.

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
Reported-by: Sasha Levin <sasha.levin@xxxxxxxxxx>
References: http://lkml.kernel.org/g/53113485.2090407@xxxxxxxxxx
---
fs/kernfs/file.c | 63 ++++++++++++++++++++++++-------------------------
include/linux/kernfs.h | 1
2 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ddcb471..8034706 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -253,55 +253,50 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
{
struct kernfs_open_file *of = kernfs_of(file);
const struct kernfs_ops *ops;
- char *buf = NULL;
- ssize_t len;
-
- /*
- * @of->mutex nests outside active ref and is just to ensure that
- * the ops aren't called concurrently for the same open file.
- */
- mutex_lock(&of->mutex);
- if (!kernfs_get_active(of->kn)) {
- mutex_unlock(&of->mutex);
- return -ENODEV;
- }
-
- ops = kernfs_ops(of->kn);
- if (!ops->write) {
- len = -EINVAL;
- goto out_unlock;
- }
+ size_t len;
+ char *buf;

- if (ops->atomic_write_len) {
+ if (of->atomic_write_len) {
len = count;
- if (len > ops->atomic_write_len) {
- len = -E2BIG;
- goto out_unlock;
- }
+ if (len > of->atomic_write_len)
+ return -E2BIG;
} else {
len = min_t(size_t, count, PAGE_SIZE);
}

buf = kmalloc(len + 1, GFP_KERNEL);
- if (!buf) {
- len = -ENOMEM;
- goto out_unlock;
- }
+ if (!buf)
+ return -ENOMEM;

if (copy_from_user(buf, user_buf, len)) {
len = -EFAULT;
- goto out_unlock;
+ goto out_free;
}
buf[len] = '\0'; /* guarantee string termination */

- len = ops->write(of, buf, len, *ppos);
-out_unlock:
+ /*
+ * @of->mutex nests outside active ref and is just to ensure that
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+ if (!kernfs_get_active(of->kn)) {
+ mutex_unlock(&of->mutex);
+ len = -ENODEV;
+ goto out_free;
+ }
+
+ ops = kernfs_ops(of->kn);
+ if (ops->write)
+ len = ops->write(of, buf, len, *ppos);
+ else
+ len = -EINVAL;
+
kernfs_put_active(of->kn);
mutex_unlock(&of->mutex);

if (len > 0)
*ppos += len;
-
+out_free:
kfree(buf);
return len;
}
@@ -666,6 +661,12 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
of->file = file;

/*
+ * Write path needs to access atomic_write_len outside active reference.
+ * Cache it in open_file. See kernfs_fop_write() for details.
+ */
+ of->atomic_write_len = ops->atomic_write_len;
+
+ /*
* Always instantiate seq_file even if read access doesn't use
* seq_file or is not requested. This unifies private data access
* and readable regular files are the vast majority anyway.
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 649497a..65a3e5a 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -158,6 +158,7 @@ struct kernfs_open_file {
int event;
struct list_head list;

+ size_t atomic_write_len;
bool mmapped;
const struct vm_operations_struct *vm_ops;
};
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/