Re: [PATCH v6 3/5] mm: introduce mmap3 for safely defining new mmap flags

From: Jan Kara
Date: Thu Aug 24 2017 - 09:05:05 EST


On Wed 23-08-17 16:48:51, Dan Williams wrote:
> The mmap(2) syscall suffers from the ABI anti-pattern of not validating
> unknown flags. However, proposals like MAP_SYNC and MAP_DIRECT need a
> mechanism to define new behavior that is known to fail on older kernels
> without the support. Define a new mmap3 syscall that checks for
> unsupported flags at syscall entry and add a 'mmap_supported_mask' to
> 'struct file_operations' so generic code can validate the ->mmap()
> handler knows about the specified flags. This also arranges for the
> flags to be passed to the handler so it can do further local validation
> if the requested behavior can be fulfilled.
>
> Cc: Jan Kara <jack@xxxxxxx>
> Cc: Arnd Bergmann <arnd@xxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Suggested-by: Andy Lutomirski <luto@xxxxxxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>

OK, are we sold on this approach to introduction of new mmap flags? I'm
asking because working API for mmap flag is basically the only thing that's
missing from my MAP_SYNC patches so I'd like to rebase my patches onto
something that is working...

Honza


> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> include/linux/fs.h | 1 +
> include/linux/mm.h | 2 +-
> include/linux/mman.h | 42 ++++++++++++++++++++++++++++++++
> include/linux/syscalls.h | 3 ++
> mm/mmap.c | 32 ++++++++++++++++++++++--
> 7 files changed, 78 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 448ac2161112..0618b5b38b45 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -391,3 +391,4 @@
> 382 i386 pkey_free sys_pkey_free
> 383 i386 statx sys_statx
> 384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
> +385 i386 mmap3 sys_mmap_pgoff_strict
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 5aef183e2f85..e204c736d7e9 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -339,6 +339,7 @@
> 330 common pkey_alloc sys_pkey_alloc
> 331 common pkey_free sys_pkey_free
> 332 common statx sys_statx
> +333 common mmap3 sys_mmap_pgoff_strict
>
> #
> # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 33d1ee8f51be..db42da9f98c4 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1674,6 +1674,7 @@ struct file_operations {
> long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
> long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
> int (*mmap) (struct file *, struct vm_area_struct *, unsigned long);
> + unsigned long mmap_supported_mask;
> int (*open) (struct inode *, struct file *);
> int (*flush) (struct file *, fl_owner_t id);
> int (*release) (struct inode *, struct file *);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 46b9ac5e8569..49eef48da4b7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2090,7 +2090,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
>
> extern unsigned long mmap_region(struct file *file, unsigned long addr,
> unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> - struct list_head *uf);
> + struct list_head *uf, unsigned long flags);
> extern unsigned long do_mmap(struct file *file, unsigned long addr,
> unsigned long len, unsigned long prot, unsigned long flags,
> vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index c8367041fafd..64b6cb3dec70 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -7,6 +7,48 @@
> #include <linux/atomic.h>
> #include <uapi/linux/mman.h>
>
> +/*
> + * Arrange for undefined architecture specific flags to be rejected by
> + * default.
> + */
> +#ifndef MAP_32BIT
> +#define MAP_32BIT 0
> +#endif
> +#ifndef MAP_HUGE_2MB
> +#define MAP_HUGE_2MB 0
> +#endif
> +#ifndef MAP_HUGE_1GB
> +#define MAP_HUGE_1GB 0
> +#endif
> +#ifndef MAP_UNINITIALIZED
> +#define MAP_UNINITIALIZED 0
> +#endif
> +
> +/*
> + * The historical set of flags that all mmap implementations implicitly
> + * support when file_operations.mmap_supported_mask is zero. With the
> + * mmap3 syscall the deprecated MAP_DENYWRITE and MAP_EXECUTABLE bit
> + * values are explicitly rejected with EOPNOTSUPP rather than being
> + * silently accepted.
> + */
> +#define LEGACY_MAP_SUPPORTED_MASK (MAP_SHARED \
> + | MAP_PRIVATE \
> + | MAP_FIXED \
> + | MAP_ANONYMOUS \
> + | MAP_UNINITIALIZED \
> + | MAP_GROWSDOWN \
> + | MAP_LOCKED \
> + | MAP_NORESERVE \
> + | MAP_POPULATE \
> + | MAP_NONBLOCK \
> + | MAP_STACK \
> + | MAP_HUGETLB \
> + | MAP_32BIT \
> + | MAP_HUGE_2MB \
> + | MAP_HUGE_1GB)
> +
> +#define MAP_SUPPORTED_MASK (LEGACY_MAP_SUPPORTED_MASK)
> +
> extern int sysctl_overcommit_memory;
> extern int sysctl_overcommit_ratio;
> extern unsigned long sysctl_overcommit_kbytes;
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 3cb15ea48aee..c0e0c99cf4ad 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -858,6 +858,9 @@ asmlinkage long sys_perf_event_open(
> asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
> unsigned long prot, unsigned long flags,
> unsigned long fd, unsigned long pgoff);
> +asmlinkage long sys_mmap_pgoff_strict(unsigned long addr, unsigned long len,
> + unsigned long prot, unsigned long flags,
> + unsigned long fd, unsigned long pgoff);
> asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
> asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
> struct file_handle __user *handle,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 744faae86781..386706831d67 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1464,7 +1464,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> vm_flags |= VM_NORESERVE;
> }
>
> - addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
> + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf, flags);
> if (!IS_ERR_VALUE(addr) &&
> ((vm_flags & VM_LOCKED) ||
> (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
> @@ -1521,6 +1521,32 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
> return retval;
> }
>
> +SYSCALL_DEFINE6(mmap_pgoff_strict, unsigned long, addr, unsigned long, len,
> + unsigned long, prot, unsigned long, flags,
> + unsigned long, fd, unsigned long, pgoff)
> +{
> + if (flags & ~(MAP_SUPPORTED_MASK))
> + return -EOPNOTSUPP;
> +
> + if (!(flags & MAP_ANONYMOUS)) {
> + unsigned long f_supported;
> + struct file *file;
> +
> + audit_mmap_fd(fd, flags);
> + file = fget(fd);
> + if (!file)
> + return -EBADF;
> + f_supported = file->f_op->mmap_supported_mask;
> + fput(file);
> + if (!f_supported)
> + f_supported = LEGACY_MAP_SUPPORTED_MASK;
> + if (flags & ~f_supported)
> + return -EOPNOTSUPP;
> + }
> +
> + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
> +}
> +
> #ifdef __ARCH_WANT_SYS_OLD_MMAP
> struct mmap_arg_struct {
> unsigned long addr;
> @@ -1601,7 +1627,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
>
> unsigned long mmap_region(struct file *file, unsigned long addr,
> unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> - struct list_head *uf)
> + struct list_head *uf, unsigned long flags)
> {
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma, *prev;
> @@ -1686,7 +1712,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> * new file must not have been exposed to user-space, yet.
> */
> vma->vm_file = get_file(file);
> - error = call_mmap(file, vma, 0);
> + error = call_mmap(file, vma, flags);
> if (error)
> goto unmap_and_free_vma;
>
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR