[patch 01/10] unprivileged mounts: add user mounts to the kernel

From: Miklos Szeredi
Date: Tue Feb 05 2008 - 16:41:05 EST


From: Miklos Szeredi <mszeredi@xxxxxxx>

This patchset adds support for keeping mount ownership information in the
kernel, and allow unprivileged mount(2) and umount(2) in certain cases.

The mount owner has the following privileges:

- unmount the owned mount
- create a submount under the owned mount

The sysadmin can set the owner explicitly on mount and remount. When an
unprivileged user creates a mount, then the owner is automatically set to the
user.

The following use cases are envisioned:

1) Private namespace, with selected mounts owned by user. E.g.
/home/$USER is a good candidate for allowing unpriv mounts and unmounts
within.

2) Private namespace, with all mounts owned by user and having the "nosuid"
flag. User can mount and umount anywhere within the namespace, but suid
programs will not work.

3) Global namespace, with a designated directory, which is a mount owned by
the user. E.g. /mnt/users/$USER is set up so that it is bind mounted onto
itself, and set to be owned by $USER. The user can add/remove mounts only
under this directory.

The following extra security measures are taken for unprivileged mounts:

- usermounts are limited by a sysctl tunable
- force "nosuid,nodev" mount options on the created mount

This series increases the size of vmlinux by about 1.5k on x86_64.

For testing unprivileged mounts (and for other purposes) simple
mount/umount utilities are available from:

http://www.kernel.org/pub/linux/kernel/people/mszeredi/mmount/

A preliminary patch for util-linux-ng to add the same functionality to
mount(8) and umount(8) is available here:

http://lkml.org/lkml/2008/1/16/103


This patch:

A new mount flag, MS_SETUSER is used to make a mount owned by a user. If this
flag is specified, then the owner will be set to the current fsuid and the
mount will be marked with the MNT_USER flag. On remount don't preserve
previous owner, and treat MS_SETUSER as for a new mount. The MS_SETUSER flag
is ignored on mount move.

The MNT_USER flag is not copied on any kind of mount cloning: namespace
creation, binding or propagation. For bind mounts the cloned mount(s) are set
to MNT_USER depending on the MS_SETUSER mount flag. In all the other cases
MNT_USER is always cleared.

For MNT_USER mounts a "user=UID" option is added to /proc/PID/mounts. This is
compatible with how mount ownership is stored in /etc/mtab.

The rationale for using MS_SETUSER and MNT_USER, to distinguish "user"
mounts from "non-user" or "legacy" mounts are follows:

a) Mount(2) and umount(2) on legacy mounts always need CAP_SYS_ADMIN
capability. As opposed to user mounts, which will only require,
that the mount owner matches the current fsuid. So a process
with fsuid=0 should not be able to mount/umount legacy mounts
without the CAP_SYS_ADMIN capability.

b) Legacy userspace programs may set fsuid to nonzero before calling
mount(2). In such an unlikely case, this patchset would cause
an unintended side effect of making the mount owned by the fsuid.

c) For legacy mounts, no "user=UID" option should be shown in
/proc/mounts for backwards compatibility.

Signed-off-by: Miklos Szeredi <mszeredi@xxxxxxx>
Acked-by: Serge Hallyn <serue@xxxxxxxxxx>
---

Index: linux/fs/namespace.c
===================================================================
--- linux.orig/fs/namespace.c 2008-02-04 23:47:47.000000000 +0100
+++ linux/fs/namespace.c 2008-02-04 23:47:50.000000000 +0100
@@ -511,6 +511,13 @@ static struct vfsmount *skip_mnt_tree(st
return p;
}

+static void set_mnt_user(struct vfsmount *mnt)
+{
+ WARN_ON(mnt->mnt_flags & MNT_USER);
+ mnt->mnt_uid = current->fsuid;
+ mnt->mnt_flags |= MNT_USER;
+}
+
static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
int flag)
{
@@ -525,6 +532,11 @@ static struct vfsmount *clone_mnt(struct
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;

+ /* don't copy the MNT_USER flag */
+ mnt->mnt_flags &= ~MNT_USER;
+ if (flag & CL_SETUSER)
+ set_mnt_user(mnt);
+
if (flag & CL_SLAVE) {
list_add(&mnt->mnt_slave, &old->mnt_slave_list);
mnt->mnt_master = old;
@@ -712,6 +724,8 @@ static void show_mnt_opts(struct seq_fil
if (mnt->mnt_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}
+ if (mnt->mnt_flags & MNT_USER)
+ seq_printf(m, ",user=%i", mnt->mnt_uid);
}

static void show_type(struct seq_file *m, struct super_block *sb)
@@ -1320,8 +1334,9 @@ static int do_change_type(struct nameida
/*
* do loopback mount.
*/
-static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
+static int do_loopback(struct nameidata *nd, char *old_name, int flags)
{
+ int clone_fl;
struct nameidata old_nd;
struct vfsmount *mnt = NULL;
int err = mount_is_safe(nd);
@@ -1341,11 +1356,12 @@ static int do_loopback(struct nameidata
if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt))
goto out;

+ clone_fl = (flags & MS_SETUSER) ? CL_SETUSER : 0;
err = -ENOMEM;
- if (recurse)
- mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0);
+ if (flags & MS_REC)
+ mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, clone_fl);
else
- mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0);
+ mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, clone_fl);

if (!mnt)
goto out;
@@ -1407,8 +1423,11 @@ static int do_remount(struct nameidata *
err = change_mount_flags(nd->path.mnt, flags);
else
err = do_remount_sb(sb, flags, data, 0);
- if (!err)
+ if (!err) {
nd->path.mnt->mnt_flags = mnt_flags;
+ if (flags & MS_SETUSER)
+ set_mnt_user(nd->path.mnt);
+ }
up_write(&sb->s_umount);
if (!err)
security_sb_post_remount(nd->path.mnt, flags, data);
@@ -1517,10 +1536,13 @@ static int do_new_mount(struct nameidata
if (!capable(CAP_SYS_ADMIN))
return -EPERM;

- mnt = do_kern_mount(type, flags, name, data);
+ mnt = do_kern_mount(type, flags & ~MS_SETUSER, name, data);
if (IS_ERR(mnt))
return PTR_ERR(mnt);

+ if (flags & MS_SETUSER)
+ set_mnt_user(mnt);
+
return do_add_mount(mnt, nd, mnt_flags, NULL);
}

@@ -1552,7 +1574,8 @@ int do_add_mount(struct vfsmount *newmnt
if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
goto unlock;

- newmnt->mnt_flags = mnt_flags;
+ /* MNT_USER was set earlier */
+ newmnt->mnt_flags |= mnt_flags;
if ((err = graft_tree(newmnt, nd)))
goto unlock;

@@ -1874,7 +1897,7 @@ long do_mount(char *dev_name, char *dir_
retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(&nd, dev_name, flags);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
Index: linux/fs/pnode.h
===================================================================
--- linux.orig/fs/pnode.h 2008-02-04 23:47:47.000000000 +0100
+++ linux/fs/pnode.h 2008-02-04 23:47:50.000000000 +0100
@@ -23,6 +23,7 @@
#define CL_MAKE_SHARED 0x08
#define CL_PROPAGATION 0x10
#define CL_PRIVATE 0x20
+#define CL_SETUSER 0x40

static inline void set_mnt_shared(struct vfsmount *mnt)
{
Index: linux/include/linux/fs.h
===================================================================
--- linux.orig/include/linux/fs.h 2008-02-04 23:47:47.000000000 +0100
+++ linux/include/linux/fs.h 2008-02-04 23:47:50.000000000 +0100
@@ -125,6 +125,7 @@ extern int dir_notify_enable;
#define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */
#define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */
#define MS_I_VERSION (1<<23) /* Update inode I_version field */
+#define MS_SETUSER (1<<24) /* set mnt_uid to current user */
#define MS_ACTIVE (1<<30)
#define MS_NOUSER (1<<31)

Index: linux/include/linux/mount.h
===================================================================
--- linux.orig/include/linux/mount.h 2008-02-04 23:47:47.000000000 +0100
+++ linux/include/linux/mount.h 2008-02-04 23:47:50.000000000 +0100
@@ -33,6 +33,7 @@ struct mnt_namespace;

#define MNT_SHRINKABLE 0x100
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
+#define MNT_USER 0x400

#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
@@ -70,6 +71,8 @@ struct vfsmount {
* are held, and all mnt_writer[]s on this mount have 0 as their ->count
*/
atomic_t __mnt_writers;
+
+ uid_t mnt_uid; /* owner of the mount */
};

static inline struct vfsmount *mntget(struct vfsmount *mnt)

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/