[PATCH 2/3] vfs: ->mmap_prepare()

From: Peter Zijlstra
Date: Wed Nov 14 2007 - 15:17:55 EST


Some filesystems (NFS) need i_mutex in ->mmap(), which violates the normal
locking order. Provide a hook that is called before we take mmap_sem.

This leaves a window between ->mmap_prepare() and ->mmap(). If that's a problem
(Trond?), we could also provide ->mmap_finish() and guarantee that it is called
if ->mmap_prepare() returned success.

This would allow holding state and thereby close the window.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
Documentation/filesystems/Locking | 11 ++++++++++-
Documentation/filesystems/vfs.txt | 3 +++
include/linux/fs.h | 1 +
ipc/shm.c | 13 +++++++++++++
mm/mmap.c | 12 ++++++++++++
mm/nommu.c | 12 ++++++++++++
6 files changed, 51 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/fs.h
===================================================================
--- linux-2.6.orig/include/linux/fs.h
+++ linux-2.6/include/linux/fs.h
@@ -1172,6 +1172,7 @@ struct file_operations {
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+ int (*mmap_prepare) (struct file *, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c
+++ linux-2.6/mm/mmap.c
@@ -1035,6 +1035,12 @@ unsigned long do_mmap_pgoff(struct file
struct mm_struct *mm = current->mm;
unsigned long ret;

+ if (file && file->f_op && file->f_op->mmap_prepare) {
+ ret = file->f_op->mmap_prepare(file, len, prot, flags, pgoff);
+ if (ret)
+ return ret;
+ }
+
down_write(&mm->mmap_sem);
ret = ___do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up_write(&mm->mmap_sem);
@@ -1054,6 +1060,12 @@ unsigned long do_mmap(struct file *file,
if ((offset + PAGE_ALIGN(len)) < offset || (offset & ~PAGE_MASK))
return ret;

+ if (file && file->f_op && file->f_op->mmap_prepare) {
+ ret = file->f_op->mmap_prepare(file, len, prot, flags, pgoff);
+ if (ret)
+ return ret;
+ }
+
down_write(&mm->mmap_sem);
ret = ___do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up_write(&mm->mmap_sem);
Index: linux-2.6/mm/nommu.c
===================================================================
--- linux-2.6.orig/mm/nommu.c
+++ linux-2.6/mm/nommu.c
@@ -1025,6 +1025,12 @@ unsigned long do_mmap_pgoff(struct file
struct mm_struct *mm = current->mm;
unsigned long ret;

+ if (file && file->f_op && file->f_op->mmap_prepare) {
+ ret = file->f_op->mmap_prepare(file, len, prot, flags, pgoff);
+ if (ret)
+ return ret;
+ }
+
down_write(&mm->mmap_sem);
ret = ___do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up_write(&mm->mmap_sem);
@@ -1044,6 +1050,12 @@ unsigned long do_mmap(struct file *file,
if ((offset + PAGE_ALIGN(len)) < offset || (offset & ~PAGE_MASK))
return ret;

+ if (file && file->f_op && file->f_op->mmap_prepare) {
+ ret = file->f_op->mmap_prepare(file, len, prot, flags, pgoff);
+ if (ret)
+ return ret;
+ }
+
down_write(&mm->mmap_sem);
ret = ___do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
up_write(&mm->mmap_sem);
Index: linux-2.6/ipc/shm.c
===================================================================
--- linux-2.6.orig/ipc/shm.c
+++ linux-2.6/ipc/shm.c
@@ -300,6 +300,12 @@ static int shm_mmap(struct file * file,
struct shm_file_data *sfd = shm_file_data(file);
int ret;

+ /*
+ * SHM backing filesystems must not provide mmap_prepare!
+ * See do_shmat().
+ */
+ WARN_ON(sfd->file->f_op->mmap_prepare);
+
ret = sfd->file->f_op->mmap(sfd->file, vma);
if (ret != 0)
return ret;
@@ -1012,6 +1018,13 @@ long do_shmat(int shmid, char __user *sh
goto invalid;
}

+ /*
+ * The usage of ___do_mmap_pgoff() is needed because we must already
+ * hold the mmap_sem here due to find_vma_intersection vs mmap races.
+ *
+ * This prohibits SHM backing filesystems from using
+ * f_op->mmap_prepare().
+ */
user_addr = ___do_mmap_pgoff (file, addr, size, prot, flags, 0);
*raddr = user_addr;
err = 0;
Index: linux-2.6/Documentation/filesystems/Locking
===================================================================
--- linux-2.6.orig/Documentation/filesystems/Locking
+++ linux-2.6/Documentation/filesystems/Locking
@@ -378,6 +378,8 @@ prototypes:
unsigned long);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+ int (*mmap_prepare) (struct file *, unsigned long len, unsigned long prot,
+ unsigned long flags, unsigned long pgoff);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
@@ -413,7 +415,8 @@ poll: no
ioctl: yes (see below)
unlocked_ioctl: no (see below)
compat_ioctl: no
-mmap: no
+mmap_prepare: no (see below)
+mmap: no (see below)
open: maybe (see below)
flush: no
release: no
@@ -436,6 +439,12 @@ For many filesystems, it is probably saf
semaphore. Note some filesystems (i.e. remote ones) provide no
protection for i_size so you will need to use the BKL.

+->mmap_prepare() is called on mmap(2) _before_ acquisition of the mmap_sem;
+filesystems can use this hook to prepare the file for being mapped, and can
+take i_mutex if they need to.
+
+->mmap() is called while the mmap_sem is held.
+
->open() locking is in-transit: big lock partially moved into the methods.
The only exception is ->open() in the instances of file_operations that never
end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices
Index: linux-2.6/Documentation/filesystems/vfs.txt
===================================================================
--- linux-2.6.orig/Documentation/filesystems/vfs.txt
+++ linux-2.6/Documentation/filesystems/vfs.txt
@@ -762,6 +762,7 @@ struct file_operations {
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
+ int (*mmap_prepare) (struct file *, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff);
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *);
@@ -809,6 +810,8 @@ otherwise noted.
compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
are used on 64 bit kernels.

+ mmap_prepare: called by the mmap(2) system call before mmap_sem is taken
+
mmap: called by the mmap(2) system call

open: called by the VFS when an inode should be opened. When the VFS

--

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/